//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

14 #include "ARMISelLowering.h"
15 #include "ARMBaseInstrInfo.h"
16 #include "ARMBaseRegisterInfo.h"
17 #include "ARMCallingConv.h"
18 #include "ARMConstantPoolValue.h"
19 #include "ARMMachineFunctionInfo.h"
20 #include "ARMPerfectShuffle.h"
21 #include "ARMRegisterInfo.h"
22 #include "ARMSelectionDAGInfo.h"
23 #include "ARMSubtarget.h"
24 #include "ARMTargetTransformInfo.h"
25 #include "MCTargetDesc/ARMAddressingModes.h"
26 #include "MCTargetDesc/ARMBaseInfo.h"
27 #include "Utils/ARMBaseInfo.h"
28 #include "llvm/ADT/APFloat.h"
29 #include "llvm/ADT/APInt.h"
30 #include "llvm/ADT/ArrayRef.h"
31 #include "llvm/ADT/BitVector.h"
32 #include "llvm/ADT/DenseMap.h"
33 #include "llvm/ADT/STLExtras.h"
34 #include "llvm/ADT/SmallPtrSet.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/ADT/StringExtras.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/ADT/StringSwitch.h"
40 #include "llvm/ADT/Triple.h"
41 #include "llvm/ADT/Twine.h"
42 #include "llvm/Analysis/VectorUtils.h"
43 #include "llvm/CodeGen/CallingConvLower.h"
44 #include "llvm/CodeGen/ISDOpcodes.h"
45 #include "llvm/CodeGen/IntrinsicLowering.h"
46 #include "llvm/CodeGen/MachineBasicBlock.h"
47 #include "llvm/CodeGen/MachineConstantPool.h"
48 #include "llvm/CodeGen/MachineFrameInfo.h"
49 #include "llvm/CodeGen/MachineFunction.h"
50 #include "llvm/CodeGen/MachineInstr.h"
51 #include "llvm/CodeGen/MachineInstrBuilder.h"
52 #include "llvm/CodeGen/MachineJumpTableInfo.h"
53 #include "llvm/CodeGen/MachineMemOperand.h"
54 #include "llvm/CodeGen/MachineOperand.h"
55 #include "llvm/CodeGen/MachineRegisterInfo.h"
56 #include "llvm/CodeGen/RuntimeLibcalls.h"
57 #include "llvm/CodeGen/SelectionDAG.h"
58 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
59 #include "llvm/CodeGen/SelectionDAGNodes.h"
60 #include "llvm/CodeGen/TargetInstrInfo.h"
61 #include "llvm/CodeGen/TargetLowering.h"
62 #include "llvm/CodeGen/TargetOpcodes.h"
63 #include "llvm/CodeGen/TargetRegisterInfo.h"
64 #include "llvm/CodeGen/TargetSubtargetInfo.h"
65 #include "llvm/CodeGen/ValueTypes.h"
66 #include "llvm/IR/Attributes.h"
67 #include "llvm/IR/CallingConv.h"
68 #include "llvm/IR/Constant.h"
69 #include "llvm/IR/Constants.h"
70 #include "llvm/IR/DataLayout.h"
71 #include "llvm/IR/DebugLoc.h"
72 #include "llvm/IR/DerivedTypes.h"
73 #include "llvm/IR/Function.h"
74 #include "llvm/IR/GlobalAlias.h"
75 #include "llvm/IR/GlobalValue.h"
76 #include "llvm/IR/GlobalVariable.h"
77 #include "llvm/IR/IRBuilder.h"
78 #include "llvm/IR/InlineAsm.h"
79 #include "llvm/IR/Instruction.h"
80 #include "llvm/IR/Instructions.h"
81 #include "llvm/IR/IntrinsicInst.h"
82 #include "llvm/IR/Intrinsics.h"
83 #include "llvm/IR/IntrinsicsARM.h"
84 #include "llvm/IR/Module.h"
85 #include "llvm/IR/PatternMatch.h"
86 #include "llvm/IR/Type.h"
87 #include "llvm/IR/User.h"
88 #include "llvm/IR/Value.h"
89 #include "llvm/MC/MCInstrDesc.h"
90 #include "llvm/MC/MCInstrItineraries.h"
91 #include "llvm/MC/MCRegisterInfo.h"
92 #include "llvm/MC/MCSchedule.h"
93 #include "llvm/Support/AtomicOrdering.h"
94 #include "llvm/Support/BranchProbability.h"
95 #include "llvm/Support/Casting.h"
96 #include "llvm/Support/CodeGen.h"
97 #include "llvm/Support/CommandLine.h"
98 #include "llvm/Support/Compiler.h"
99 #include "llvm/Support/Debug.h"
100 #include "llvm/Support/ErrorHandling.h"
101 #include "llvm/Support/KnownBits.h"
102 #include "llvm/Support/MachineValueType.h"
103 #include "llvm/Support/MathExtras.h"
104 #include "llvm/Support/raw_ostream.h"
105 #include "llvm/Target/TargetMachine.h"
106 #include "llvm/Target/TargetOptions.h"
107 #include <algorithm>
108 #include <cassert>
109 #include <cstdint>
110 #include <cstdlib>
111 #include <iterator>
112 #include <limits>
113 #include <string>
114 #include <tuple>
115 #include <utility>
116 #include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
  cl::desc("Enable / disable ARM interworking (for debugging only)"),
  cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
  cl::desc("Maximum interleave factor for MVE VLDn to generate."),
  cl::init(2));

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

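// Helpers used below to register vector types with the legalizer and to mark
// which operations each vector extension can handle natively.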
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
  setOperationAction(ISD::SDIVREM, VT, Expand);
  setOperationAction(ISD::UDIVREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}

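// addDRTypeForNEON and addQRTypeForNEON register a vector type in the 64-bit
// D-register or 128-bit Q-register (D-pair) class respectively, then apply
// the common NEON operation actions above.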
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64);
}

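// setAllExpand marks every operation on a type as Expand and then re-legalizes
// the few trivial ones (bitcast, load, store, undef). It is used for types the
// current subtarget can only move around, not compute with.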
void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

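// addAllExtLoads applies the same legalize action to the any-, zero- and
// sign-extending load variants for a given (result, memory) type pair.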
void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

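// addMVEVectorTypes configures every vector type the MVE extension provides:
// the 128-bit integer vectors, the float vectors (fully usable only with
// MVE.fp), the v2i64/v2f64 types that are supported only at the
// bitcast/load/store level, and the i1 predicate vector types.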
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
    setOperationAction(ISD::ABDS, VT, Legal);
    setOperationAction(ISD::ABDU, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // Custom Expand smaller than legal vector reductions to prevent false zero
  // items being added.
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);

  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v4i8 to v4i16 or v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
    setOperationAction(ISD::TRUNCATE, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
}

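// The ARMTargetLowering constructor below does the remaining one-time setup:
// it picks calling conventions and runtime library (libcall) names based on
// the subtarget and target OS, registers the legal scalar and vector types,
// and records which generic operations need custom lowering or expansion.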
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
  setLibcallName(RTLIB::MUL_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

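  // On Windows, 64-bit floating-point <-> integer conversions go through
  // __stoi64/__dtoi64-style runtime helpers, which expect the VFP calling
  // convention.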
  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

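  // i32 lives in the restricted tGPR class on Thumb-1, where most instructions
  // can only address the low registers, and in the full GPR class otherwise.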
  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
      Subtarget->hasFPRegs()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
    if (!Subtarget->hasVFP2Base())
      setAllExpand(MVT::f32);
    if (!Subtarget->hasFP64())
      setAllExpand(MVT::f64);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  if (Subtarget->hasBF16()) {
    addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
    setAllExpand(MVT::bf16);
    if (!Subtarget->hasFullFP16())
      setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }

  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      addAllExtLoads(VT, InnerVT, Expand);
    }

    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasMVEIntegerOps())
    addMVEVectorTypes(Subtarget->hasMVEFloatOps());

  // Combine low-overhead loop intrinsics so that we can lower i1 types.
  if (Subtarget->hasLOB()) {
    setTargetDAGCombine(ISD::BRCOND);
    setTargetDAGCombine(ISD::BR_CC);
  }

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }

    if (Subtarget->hasBF16()) {
      addQRTypeForNEON(MVT::v8bf16);
      addDRTypeForNEON(MVT::v4bf16);
    }
  }

  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // none of Neon, MVE or VFP supports any arithmetic operations on it.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }

  if (Subtarget->hasNEON()) {
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does
    // it have a FP_TO_[SU]INT instruction with a narrower destination than
    // source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
    }

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

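  // DAG combines shared between NEON and MVE; these recognize vector
  // build/shuffle, extension, intrinsic and reduction patterns that the
  // target lowers in its own way.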
  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::VECREDUCE_ADD);
    setTargetDAGCombine(ISD::ADD);
    setTargetDAGCombine(ISD::BITCAST);
  }
  if (Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::SMIN);
    setTargetDAGCombine(ISD::UMIN);
    setTargetDAGCombine(ISD::SMAX);
    setTargetDAGCombine(ISD::UMAX);
    setTargetDAGCombine(ISD::FP_EXTEND);
    setTargetDAGCombine(ISD::SELECT);
    setTargetDAGCombine(ISD::SELECT_CC);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    }
  }

  if (!Subtarget->hasFP16()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
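  // The DSP extension provides native 8- and 16-bit saturating add/subtract,
  // so the corresponding saturating nodes get custom lowering; 32-bit
  // saturating add/subtract comes with the base DSP instructions.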
  if (Subtarget->hasDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i16, Custom);
  }
  if (Subtarget->hasBaseDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  }

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);
  setOperationAction(ISD::STORE, MVT::i64, Custom);

  // MVE lowers 64 bit shifts to lsll and lsrl
  // assuming that ISD::SRL and SRA of i64 are already marked custom
  if (Subtarget->hasMVEIntegerOps())
    setOperationAction(ISD::SHL, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

1292 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1293 // the default expansion.
1294 InsertFencesForAtomic = false;
1295 if (Subtarget->hasAnyDataBarrier() &&
1296 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1297 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1298 // to ldrex/strex loops already.
1299 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1300 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1301 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1303 // On v8, we have particularly efficient implementations of atomic fences
1304 // if they can be combined with nearby atomic loads and stores.
1305 if (!Subtarget->hasAcquireRelease() ||
1306 getTargetMachine().getOptLevel() == 0) {
1307 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1308 InsertFencesForAtomic = true;
1310 } else {
1311 // If there's anything we can use as a barrier, go through custom lowering
1312 // for ATOMIC_FENCE.
1313 // If the target has DMB in Thumb mode, fences can be inserted.
1314 if (Subtarget->hasDataBarrier())
1315 InsertFencesForAtomic = true;
1317 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1318 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1320 // Set them all for expansion, which will force libcalls.
1321 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
1322 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
1323 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
1324 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
1325 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
1326 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
1327 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
1328 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1329 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1330 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1331 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1332 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1333 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1334 // Unordered/Monotonic case.
1335 if (!InsertFencesForAtomic) {
1336 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1337 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
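// Example: on subtargets that take this Expand path (no usable barrier or
// exclusive instructions), an i32 atomic exchange cannot stay inline and is
// turned into a runtime call during legalization, roughly
//   atomic_exchange(&x, 1)  ->  call __sync_lock_test_and_set_4(&x, 1)
// (the exact helper depends on the configured libcall set; the __sync_* form
// is only the common default). On the InsertFencesForAtomic path above, the
// operation instead stays inline and the atomic expansion pass wraps it with
// "dmb ish" fences.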
1341 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1343 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1344 if (!Subtarget->hasV6Ops()) {
1345 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1346 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
1348 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1350 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1351 !Subtarget->isThumb1Only()) {
1352 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1353 // iff target supports vfp2.
1354 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1355 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1356 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1359 // We want to custom lower some of our intrinsics.
1360 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1361 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1362 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1363 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1364 if (Subtarget->useSjLjEH())
1365 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1367 setOperationAction(ISD::SETCC, MVT::i32, Expand);
1368 setOperationAction(ISD::SETCC, MVT::f32, Expand);
1369 setOperationAction(ISD::SETCC, MVT::f64, Expand);
1370 setOperationAction(ISD::SELECT, MVT::i32, Custom);
1371 setOperationAction(ISD::SELECT, MVT::f32, Custom);
1372 setOperationAction(ISD::SELECT, MVT::f64, Custom);
1373 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1374 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1375 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1376 if (Subtarget->hasFullFP16()) {
1377 setOperationAction(ISD::SETCC, MVT::f16, Expand);
1378 setOperationAction(ISD::SELECT, MVT::f16, Custom);
1379 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
1382 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
1384 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1385 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1386 if (Subtarget->hasFullFP16())
1387 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1388 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1389 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1390 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1392 // We don't support sin/cos/fmod/copysign/pow
1393 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1394 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1395 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1396 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1397 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1398 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1399 setOperationAction(ISD::FREM, MVT::f64, Expand);
1400 setOperationAction(ISD::FREM, MVT::f32, Expand);
1401 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1402 !Subtarget->isThumb1Only()) {
1403 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1404 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1406 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1407 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1409 if (!Subtarget->hasVFP4Base()) {
1410 setOperationAction(ISD::FMA, MVT::f64, Expand);
1411 setOperationAction(ISD::FMA, MVT::f32, Expand);
1414 // Various VFP goodness
1415 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1416 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1417 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1418 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1419 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1422 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1423 if (!Subtarget->hasFP16()) {
1424 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1425 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1428 // Strict floating-point comparisons need custom lowering.
1429 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
1430 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1431 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
1432 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
1433 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
1434 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
1437 // Use __sincos_stret if available.
1438 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1439 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1440 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1441 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
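// Example: with FSINCOS marked Custom above, code that computes both
//   s = sinf(x);  c = cosf(x);
// is merged into one call to the __sincos_stret-style helper named by the
// RTLIB entries, which hands back sine and cosine together. A rough C-level
// sketch of the shape (SinCosPair and the wrapper name are illustrative; the
// real helper's return convention is platform specific):
//   struct SinCosPair { float Sin, Cos; };
//   SinCosPair R = __sincosf_stret_like(x);   // hypothetical wrapper
//   s = R.Sin;  c = R.Cos;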
1444 // FP-ARMv8 implements a lot of rounding-like FP operations.
1445 if (Subtarget->hasFPARMv8Base()) {
1446 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1447 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1448 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1449 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1450 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1451 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1452 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1453 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1454 if (Subtarget->hasNEON()) {
1455 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1456 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1457 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1458 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1461 if (Subtarget->hasFP64()) {
1462 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1463 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1464 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1465 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1466 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1467 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1468 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1469 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1473   // FP16 operations often need to be promoted to library calls
1474 if (Subtarget->hasFullFP16()) {
1475 setOperationAction(ISD::FREM, MVT::f16, Promote);
1476 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
1477 setOperationAction(ISD::FSIN, MVT::f16, Promote);
1478 setOperationAction(ISD::FCOS, MVT::f16, Promote);
1479 setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1480 setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1481 setOperationAction(ISD::FPOW, MVT::f16, Promote);
1482 setOperationAction(ISD::FEXP, MVT::f16, Promote);
1483 setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1484 setOperationAction(ISD::FLOG, MVT::f16, Promote);
1485 setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1486 setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1488 setOperationAction(ISD::FROUND, MVT::f16, Legal);
1491 if (Subtarget->hasNEON()) {
1492 // vmin and vmax aren't available in a scalar form, so we can use
1493 // a NEON instruction with an undef lane instead. This has a performance
1494 // penalty on some cores, so we don't do this unless we have been
1495 // asked to by the core tuning model.
1496 if (Subtarget->useNEONForSinglePrecisionFP()) {
1497 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1498 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1499 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1500 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1502 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1503 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1504 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1505 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1507 if (Subtarget->hasFullFP16()) {
1508 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1509 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1510 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1511 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1513 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1514 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1515 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1516 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
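// Example: the scalar FMINIMUM/FMAXIMUM entries above are implemented with the
// vector forms: an llvm.minimum.f32 on two scalars is selected to a NEON
//   vmin.f32 d0, d0, d1
// where only lane 0 of each D register holds a meaningful value and the other
// lane is undef. That is why the scalar cases are only marked Legal when
// useNEONForSinglePrecisionFP() says the core is happy to route single
// precision FP through the NEON pipeline.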
1520 // We have target-specific dag combine patterns for the following nodes:
1521 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1522 setTargetDAGCombine(ISD::ADD);
1523 setTargetDAGCombine(ISD::SUB);
1524 setTargetDAGCombine(ISD::MUL);
1525 setTargetDAGCombine(ISD::AND);
1526 setTargetDAGCombine(ISD::OR);
1527 setTargetDAGCombine(ISD::XOR);
1529 if (Subtarget->hasMVEIntegerOps())
1530 setTargetDAGCombine(ISD::VSELECT);
1532 if (Subtarget->hasV6Ops())
1533 setTargetDAGCombine(ISD::SRL);
1534 if (Subtarget->isThumb1Only())
1535 setTargetDAGCombine(ISD::SHL);
1537 setStackPointerRegisterToSaveRestore(ARM::SP);
1539 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1540 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1541 setSchedulingPreference(Sched::RegPressure);
1542 else
1543 setSchedulingPreference(Sched::Hybrid);
1545 //// temporary - rewrite interface to use type
1546 MaxStoresPerMemset = 8;
1547 MaxStoresPerMemsetOptSize = 4;
1548 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1549 MaxStoresPerMemcpyOptSize = 2;
1550 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1551 MaxStoresPerMemmoveOptSize = 2;
1553 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1554 // are at least 4 bytes aligned.
1555 setMinStackArgumentAlignment(Align(4));
1557 // Prefer likely predicted branches to selects on out-of-order cores.
1558 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1560 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1562 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1564 if (Subtarget->isThumb() || Subtarget->isThumb2())
1565 setTargetDAGCombine(ISD::ABS);
1568 bool ARMTargetLowering::useSoftFloat() const {
1569 return Subtarget->useSoftFloat();
1572 // FIXME: It might make sense to define the representative register class as the
1573 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1574 // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1575 // SPR's representative would be DPR_VFP2. This should work well if register
1576 // pressure tracking were modified such that a register use would increment the
1577 // pressure of the register class's representative and all of its super
1578 // classes' representatives transitively. We have not implemented this because
1579 // of the difficulty prior to coalescing of modeling operand register classes
1580 // due to the common occurrence of cross class copies and subregister insertions
1581 // and extractions.
1582 std::pair<const TargetRegisterClass *, uint8_t>
1583 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1584 MVT VT) const {
1585 const TargetRegisterClass *RRC = nullptr;
1586 uint8_t Cost = 1;
1587 switch (VT.SimpleTy) {
1588 default:
1589 return TargetLowering::findRepresentativeClass(TRI, VT);
1590 // Use DPR as representative register class for all floating point
1591 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1592 // the cost is 1 for both f32 and f64.
1593 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1594 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1595 RRC = &ARM::DPRRegClass;
1596 // When NEON is used for SP, only half of the register file is available
1597 // because operations that define both SP and DP results will be constrained
1598 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1599 // coalescing by double-counting the SP regs. See the FIXME above.
1600 if (Subtarget->useNEONForSinglePrecisionFP())
1601 Cost = 2;
1602 break;
1603 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1604 case MVT::v4f32: case MVT::v2f64:
1605 RRC = &ARM::DPRRegClass;
1606 Cost = 2;
1607 break;
1608 case MVT::v4i64:
1609 RRC = &ARM::DPRRegClass;
1610 Cost = 4;
1611 break;
1612 case MVT::v8i64:
1613 RRC = &ARM::DPRRegClass;
1614 Cost = 8;
1615 break;
1617 return std::make_pair(RRC, Cost);
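// Example: the pairs returned above only feed register-pressure heuristics.
// f32/f64 and the 64-bit vectors map to DPR with Cost = 1 (raised to 2 when
// NEON is used for single precision, modelling the D0-D15 restriction), a
// 128-bit type such as v4f32 maps to DPR with Cost = 2 (one Q register is two
// D registers), and the pseudo-wide v4i64/v8i64 tuples count as 4 and 8 D
// registers respectively.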
1620 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1621 #define MAKE_CASE(V) \
1622 case V: \
1623 return #V;
1624 switch ((ARMISD::NodeType)Opcode) {
1625 case ARMISD::FIRST_NUMBER:
1626 break;
1627 MAKE_CASE(ARMISD::Wrapper)
1628 MAKE_CASE(ARMISD::WrapperPIC)
1629 MAKE_CASE(ARMISD::WrapperJT)
1630 MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)
1631 MAKE_CASE(ARMISD::CALL)
1632 MAKE_CASE(ARMISD::CALL_PRED)
1633 MAKE_CASE(ARMISD::CALL_NOLINK)
1634 MAKE_CASE(ARMISD::tSECALL)
1635 MAKE_CASE(ARMISD::BRCOND)
1636 MAKE_CASE(ARMISD::BR_JT)
1637 MAKE_CASE(ARMISD::BR2_JT)
1638 MAKE_CASE(ARMISD::RET_FLAG)
1639 MAKE_CASE(ARMISD::SERET_FLAG)
1640 MAKE_CASE(ARMISD::INTRET_FLAG)
1641 MAKE_CASE(ARMISD::PIC_ADD)
1642 MAKE_CASE(ARMISD::CMP)
1643 MAKE_CASE(ARMISD::CMN)
1644 MAKE_CASE(ARMISD::CMPZ)
1645 MAKE_CASE(ARMISD::CMPFP)
1646 MAKE_CASE(ARMISD::CMPFPE)
1647 MAKE_CASE(ARMISD::CMPFPw0)
1648 MAKE_CASE(ARMISD::CMPFPEw0)
1649 MAKE_CASE(ARMISD::BCC_i64)
1650 MAKE_CASE(ARMISD::FMSTAT)
1651 MAKE_CASE(ARMISD::CMOV)
1652 MAKE_CASE(ARMISD::SUBS)
1653 MAKE_CASE(ARMISD::SSAT)
1654 MAKE_CASE(ARMISD::USAT)
1655 MAKE_CASE(ARMISD::ASRL)
1656 MAKE_CASE(ARMISD::LSRL)
1657 MAKE_CASE(ARMISD::LSLL)
1658 MAKE_CASE(ARMISD::SRL_FLAG)
1659 MAKE_CASE(ARMISD::SRA_FLAG)
1660 MAKE_CASE(ARMISD::RRX)
1661 MAKE_CASE(ARMISD::ADDC)
1662 MAKE_CASE(ARMISD::ADDE)
1663 MAKE_CASE(ARMISD::SUBC)
1664 MAKE_CASE(ARMISD::SUBE)
1665 MAKE_CASE(ARMISD::LSLS)
1666 MAKE_CASE(ARMISD::VMOVRRD)
1667 MAKE_CASE(ARMISD::VMOVDRR)
1668 MAKE_CASE(ARMISD::VMOVhr)
1669 MAKE_CASE(ARMISD::VMOVrh)
1670 MAKE_CASE(ARMISD::VMOVSR)
1671 MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)
1672 MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)
1673 MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)
1674 MAKE_CASE(ARMISD::TC_RETURN)
1675 MAKE_CASE(ARMISD::THREAD_POINTER)
1676 MAKE_CASE(ARMISD::DYN_ALLOC)
1677 MAKE_CASE(ARMISD::MEMBARRIER_MCR)
1678 MAKE_CASE(ARMISD::PRELOAD)
1679 MAKE_CASE(ARMISD::LDRD)
1680 MAKE_CASE(ARMISD::STRD)
1681 MAKE_CASE(ARMISD::WIN__CHKSTK)
1682 MAKE_CASE(ARMISD::WIN__DBZCHK)
1683 MAKE_CASE(ARMISD::PREDICATE_CAST)
1684 MAKE_CASE(ARMISD::VECTOR_REG_CAST)
1685 MAKE_CASE(ARMISD::MVESEXT)
1686 MAKE_CASE(ARMISD::MVEZEXT)
1687 MAKE_CASE(ARMISD::MVETRUNC)
1688 MAKE_CASE(ARMISD::VCMP)
1689 MAKE_CASE(ARMISD::VCMPZ)
1690 MAKE_CASE(ARMISD::VTST)
1691 MAKE_CASE(ARMISD::VSHLs)
1692 MAKE_CASE(ARMISD::VSHLu)
1693 MAKE_CASE(ARMISD::VSHLIMM)
1694 MAKE_CASE(ARMISD::VSHRsIMM)
1695 MAKE_CASE(ARMISD::VSHRuIMM)
1696 MAKE_CASE(ARMISD::VRSHRsIMM)
1697 MAKE_CASE(ARMISD::VRSHRuIMM)
1698 MAKE_CASE(ARMISD::VRSHRNIMM)
1699 MAKE_CASE(ARMISD::VQSHLsIMM)
1700 MAKE_CASE(ARMISD::VQSHLuIMM)
1701 MAKE_CASE(ARMISD::VQSHLsuIMM)
1702 MAKE_CASE(ARMISD::VQSHRNsIMM)
1703 MAKE_CASE(ARMISD::VQSHRNuIMM)
1704 MAKE_CASE(ARMISD::VQSHRNsuIMM)
1705 MAKE_CASE(ARMISD::VQRSHRNsIMM)
1706 MAKE_CASE(ARMISD::VQRSHRNuIMM)
1707 MAKE_CASE(ARMISD::VQRSHRNsuIMM)
1708 MAKE_CASE(ARMISD::VSLIIMM)
1709 MAKE_CASE(ARMISD::VSRIIMM)
1710 MAKE_CASE(ARMISD::VGETLANEu)
1711 MAKE_CASE(ARMISD::VGETLANEs)
1712 MAKE_CASE(ARMISD::VMOVIMM)
1713 MAKE_CASE(ARMISD::VMVNIMM)
1714 MAKE_CASE(ARMISD::VMOVFPIMM)
1715 MAKE_CASE(ARMISD::VDUP)
1716 MAKE_CASE(ARMISD::VDUPLANE)
1717 MAKE_CASE(ARMISD::VEXT)
1718 MAKE_CASE(ARMISD::VREV64)
1719 MAKE_CASE(ARMISD::VREV32)
1720 MAKE_CASE(ARMISD::VREV16)
1721 MAKE_CASE(ARMISD::VZIP)
1722 MAKE_CASE(ARMISD::VUZP)
1723 MAKE_CASE(ARMISD::VTRN)
1724 MAKE_CASE(ARMISD::VTBL1)
1725 MAKE_CASE(ARMISD::VTBL2)
1726 MAKE_CASE(ARMISD::VMOVN)
1727 MAKE_CASE(ARMISD::VQMOVNs)
1728 MAKE_CASE(ARMISD::VQMOVNu)
1729 MAKE_CASE(ARMISD::VCVTN)
1730 MAKE_CASE(ARMISD::VCVTL)
1731 MAKE_CASE(ARMISD::VIDUP)
1732 MAKE_CASE(ARMISD::VMULLs)
1733 MAKE_CASE(ARMISD::VMULLu)
1734 MAKE_CASE(ARMISD::VQDMULH)
1735 MAKE_CASE(ARMISD::VADDVs)
1736 MAKE_CASE(ARMISD::VADDVu)
1737 MAKE_CASE(ARMISD::VADDVps)
1738 MAKE_CASE(ARMISD::VADDVpu)
1739 MAKE_CASE(ARMISD::VADDLVs)
1740 MAKE_CASE(ARMISD::VADDLVu)
1741 MAKE_CASE(ARMISD::VADDLVAs)
1742 MAKE_CASE(ARMISD::VADDLVAu)
1743 MAKE_CASE(ARMISD::VADDLVps)
1744 MAKE_CASE(ARMISD::VADDLVpu)
1745 MAKE_CASE(ARMISD::VADDLVAps)
1746 MAKE_CASE(ARMISD::VADDLVApu)
1747 MAKE_CASE(ARMISD::VMLAVs)
1748 MAKE_CASE(ARMISD::VMLAVu)
1749 MAKE_CASE(ARMISD::VMLAVps)
1750 MAKE_CASE(ARMISD::VMLAVpu)
1751 MAKE_CASE(ARMISD::VMLALVs)
1752 MAKE_CASE(ARMISD::VMLALVu)
1753 MAKE_CASE(ARMISD::VMLALVps)
1754 MAKE_CASE(ARMISD::VMLALVpu)
1755 MAKE_CASE(ARMISD::VMLALVAs)
1756 MAKE_CASE(ARMISD::VMLALVAu)
1757 MAKE_CASE(ARMISD::VMLALVAps)
1758 MAKE_CASE(ARMISD::VMLALVApu)
1759 MAKE_CASE(ARMISD::VMINVu)
1760 MAKE_CASE(ARMISD::VMINVs)
1761 MAKE_CASE(ARMISD::VMAXVu)
1762 MAKE_CASE(ARMISD::VMAXVs)
1763 MAKE_CASE(ARMISD::UMAAL)
1764 MAKE_CASE(ARMISD::UMLAL)
1765 MAKE_CASE(ARMISD::SMLAL)
1766 MAKE_CASE(ARMISD::SMLALBB)
1767 MAKE_CASE(ARMISD::SMLALBT)
1768 MAKE_CASE(ARMISD::SMLALTB)
1769 MAKE_CASE(ARMISD::SMLALTT)
1770 MAKE_CASE(ARMISD::SMULWB)
1771 MAKE_CASE(ARMISD::SMULWT)
1772 MAKE_CASE(ARMISD::SMLALD)
1773 MAKE_CASE(ARMISD::SMLALDX)
1774 MAKE_CASE(ARMISD::SMLSLD)
1775 MAKE_CASE(ARMISD::SMLSLDX)
1776 MAKE_CASE(ARMISD::SMMLAR)
1777 MAKE_CASE(ARMISD::SMMLSR)
1778 MAKE_CASE(ARMISD::QADD16b)
1779 MAKE_CASE(ARMISD::QSUB16b)
1780 MAKE_CASE(ARMISD::QADD8b)
1781 MAKE_CASE(ARMISD::QSUB8b)
1782 MAKE_CASE(ARMISD::UQADD16b)
1783 MAKE_CASE(ARMISD::UQSUB16b)
1784 MAKE_CASE(ARMISD::UQADD8b)
1785 MAKE_CASE(ARMISD::UQSUB8b)
1786 MAKE_CASE(ARMISD::BUILD_VECTOR)
1787 MAKE_CASE(ARMISD::BFI)
1788 MAKE_CASE(ARMISD::VORRIMM)
1789 MAKE_CASE(ARMISD::VBICIMM)
1790 MAKE_CASE(ARMISD::VBSP)
1791 MAKE_CASE(ARMISD::MEMCPY)
1792 MAKE_CASE(ARMISD::VLD1DUP)
1793 MAKE_CASE(ARMISD::VLD2DUP)
1794 MAKE_CASE(ARMISD::VLD3DUP)
1795 MAKE_CASE(ARMISD::VLD4DUP)
1796 MAKE_CASE(ARMISD::VLD1_UPD)
1797 MAKE_CASE(ARMISD::VLD2_UPD)
1798 MAKE_CASE(ARMISD::VLD3_UPD)
1799 MAKE_CASE(ARMISD::VLD4_UPD)
1800 MAKE_CASE(ARMISD::VLD1x2_UPD)
1801 MAKE_CASE(ARMISD::VLD1x3_UPD)
1802 MAKE_CASE(ARMISD::VLD1x4_UPD)
1803 MAKE_CASE(ARMISD::VLD2LN_UPD)
1804 MAKE_CASE(ARMISD::VLD3LN_UPD)
1805 MAKE_CASE(ARMISD::VLD4LN_UPD)
1806 MAKE_CASE(ARMISD::VLD1DUP_UPD)
1807 MAKE_CASE(ARMISD::VLD2DUP_UPD)
1808 MAKE_CASE(ARMISD::VLD3DUP_UPD)
1809 MAKE_CASE(ARMISD::VLD4DUP_UPD)
1810 MAKE_CASE(ARMISD::VST1_UPD)
1811 MAKE_CASE(ARMISD::VST2_UPD)
1812 MAKE_CASE(ARMISD::VST3_UPD)
1813 MAKE_CASE(ARMISD::VST4_UPD)
1814 MAKE_CASE(ARMISD::VST1x2_UPD)
1815 MAKE_CASE(ARMISD::VST1x3_UPD)
1816 MAKE_CASE(ARMISD::VST1x4_UPD)
1817 MAKE_CASE(ARMISD::VST2LN_UPD)
1818 MAKE_CASE(ARMISD::VST3LN_UPD)
1819 MAKE_CASE(ARMISD::VST4LN_UPD)
1820 MAKE_CASE(ARMISD::WLS)
1821 MAKE_CASE(ARMISD::WLSSETUP)
1822 MAKE_CASE(ARMISD::LE)
1823 MAKE_CASE(ARMISD::LOOP_DEC)
1824 MAKE_CASE(ARMISD::CSINV)
1825 MAKE_CASE(ARMISD::CSNEG)
1826 MAKE_CASE(ARMISD::CSINC)
1827 MAKE_CASE(ARMISD::MEMCPYLOOP)
1828 MAKE_CASE(ARMISD::MEMSETLOOP)
1829 #undef MAKE_CASE
1831 return nullptr;
1834 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1835 EVT VT) const {
1836 if (!VT.isVector())
1837 return getPointerTy(DL);
1839 // MVE has a predicate register.
1840 if ((Subtarget->hasMVEIntegerOps() &&
1841 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) ||
1842 (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16)))
1843 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1844 return VT.changeVectorElementTypeToInteger();
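// Example results of the rule above:
//   setcc of two i32    -> i32   (scalar compares use the pointer-sized type)
//   setcc of two v4i32  -> v4i1  when MVE integer ops are available (VPR
//                                 predicate register)
//   setcc of two v4f32  -> v4i32 on plain NEON (per-lane all-ones/all-zeros
//                                 mask via changeVectorElementTypeToInteger)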
1847 /// getRegClassFor - Return the register class that should be used for the
1848 /// specified value type.
1849 const TargetRegisterClass *
1850 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1851 (void)isDivergent;
1852 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1853 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1854 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1855 // MVE Q registers.
1856 if (Subtarget->hasNEON()) {
1857 if (VT == MVT::v4i64)
1858 return &ARM::QQPRRegClass;
1859 if (VT == MVT::v8i64)
1860 return &ARM::QQQQPRRegClass;
1862 if (Subtarget->hasMVEIntegerOps()) {
1863 if (VT == MVT::v4i64)
1864 return &ARM::MQQPRRegClass;
1865 if (VT == MVT::v8i64)
1866 return &ARM::MQQQQPRRegClass;
1868 return TargetLowering::getRegClassFor(VT);
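// Example: the wide pseudo-types are only ever materialised for tuple loads
// and stores, so the mapping above is, roughly,
//   getRegClassFor(MVT::v4i64) -> QQPR   (NEON, 2 Q regs) or MQQPR   (MVE)
//   getRegClassFor(MVT::v8i64) -> QQQQPR (NEON, 4 Q regs) or MQQQQPR (MVE)
// while the types themselves remain illegal, so ordinary arithmetic on them is
// still split into legal pieces by type legalization.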
1871 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1872 // source/dest is aligned and the copy size is large enough. We therefore want
1873 // to align such objects passed to memory intrinsics.
1874 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1875 unsigned &PrefAlign) const {
1876 if (!isa<MemIntrinsic>(CI))
1877 return false;
1878 MinSize = 8;
1879 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1880 // cycle faster than 4-byte aligned LDM.
1881 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1882 return true;
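// Example: for a call such as memcpy(Dst, Src, 64), this hook tells the
// IR-level user (e.g. CodeGenPrepare) that any underlying object of at least
// MinSize (8) bytes is worth realigning to PrefAlign (8 bytes on v6+
// non-M-class cores, otherwise 4), so the expanded copy can use 8-byte-aligned
// LDM/STM or LDRD/STRD sequences instead of narrower accesses.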
1885 // Create a fast isel object.
1886 FastISel *
1887 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1888 const TargetLibraryInfo *libInfo) const {
1889 return ARM::createFastISel(funcInfo, libInfo);
1892 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1893 unsigned NumVals = N->getNumValues();
1894 if (!NumVals)
1895 return Sched::RegPressure;
1897 for (unsigned i = 0; i != NumVals; ++i) {
1898 EVT VT = N->getValueType(i);
1899 if (VT == MVT::Glue || VT == MVT::Other)
1900 continue;
1901 if (VT.isFloatingPoint() || VT.isVector())
1902 return Sched::ILP;
1905 if (!N->isMachineOpcode())
1906 return Sched::RegPressure;
1908   // Loads are scheduled for latency even if the instruction itinerary
1909 // is not available.
1910 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1911 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1913 if (MCID.getNumDefs() == 0)
1914 return Sched::RegPressure;
1915 if (!Itins->isEmpty() &&
1916 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1917 return Sched::ILP;
1919 return Sched::RegPressure;
1922 //===----------------------------------------------------------------------===//
1923 // Lowering Code
1924 //===----------------------------------------------------------------------===//
1926 static bool isSRL16(const SDValue &Op) {
1927 if (Op.getOpcode() != ISD::SRL)
1928 return false;
1929 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1930 return Const->getZExtValue() == 16;
1931 return false;
1934 static bool isSRA16(const SDValue &Op) {
1935 if (Op.getOpcode() != ISD::SRA)
1936 return false;
1937 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1938 return Const->getZExtValue() == 16;
1939 return false;
1942 static bool isSHL16(const SDValue &Op) {
1943 if (Op.getOpcode() != ISD::SHL)
1944 return false;
1945 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1946 return Const->getZExtValue() == 16;
1947 return false;
1950 // Check for a signed 16-bit value. We special case SRA because it makes it
1951 // simpler when also looking for SRAs that aren't sign-extending a
1952 // smaller value. Without the check, we'd need to take extra care with
1953 // checking order for some operations.
1954 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1955 if (isSRA16(Op))
1956 return isSHL16(Op.getOperand(0));
1957 return DAG.ComputeNumSignBits(Op) == 17;
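// Example: these helpers feed the later combines that form the halfword
// multiply patterns (SMULBB/SMLABB and friends). A value qualifies as a signed
// 16-bit quantity either structurally,
//   (sra (shl x, 16), 16)        // matched by isSRA16 + isSHL16
// or semantically, when ComputeNumSignBits(Op) == 17 proves the top 17 bits
// are all copies of the sign bit, i.e. the value already fits in the bottom
// half of a register the way the 16 x 16 multiply instructions expect.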
1960 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1961 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1962 switch (CC) {
1963 default: llvm_unreachable("Unknown condition code!");
1964 case ISD::SETNE: return ARMCC::NE;
1965 case ISD::SETEQ: return ARMCC::EQ;
1966 case ISD::SETGT: return ARMCC::GT;
1967 case ISD::SETGE: return ARMCC::GE;
1968 case ISD::SETLT: return ARMCC::LT;
1969 case ISD::SETLE: return ARMCC::LE;
1970 case ISD::SETUGT: return ARMCC::HI;
1971 case ISD::SETUGE: return ARMCC::HS;
1972 case ISD::SETULT: return ARMCC::LO;
1973 case ISD::SETULE: return ARMCC::LS;
1977 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1978 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1979 ARMCC::CondCodes &CondCode2) {
1980 CondCode2 = ARMCC::AL;
1981 switch (CC) {
1982 default: llvm_unreachable("Unknown FP condition!");
1983 case ISD::SETEQ:
1984 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1985 case ISD::SETGT:
1986 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1987 case ISD::SETGE:
1988 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1989 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1990 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1991 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1992 case ISD::SETO: CondCode = ARMCC::VC; break;
1993 case ISD::SETUO: CondCode = ARMCC::VS; break;
1994 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1995 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1996 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1997 case ISD::SETLT:
1998 case ISD::SETULT: CondCode = ARMCC::LT; break;
1999 case ISD::SETLE:
2000 case ISD::SETULE: CondCode = ARMCC::LE; break;
2001 case ISD::SETNE:
2002 case ISD::SETUNE: CondCode = ARMCC::NE; break;
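// Example: conditions that need CondCode2 are checked with two predicated
// tests after the compare. SETONE (ordered and not equal) maps to MI with GT
// as the second condition, so a branch on it is emitted, schematically, as
//   vcmp.f32 s0, s1
//   vmrs     APSR_nzcv, fpscr
//   bmi      .Ltaken          // "less than"
//   bgt      .Ltaken          // "greater than"
// and the fall-through covers equal or unordered.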
2006 //===----------------------------------------------------------------------===//
2007 // Calling Convention Implementation
2008 //===----------------------------------------------------------------------===//
2010 /// getEffectiveCallingConv - Get the effective calling convention, taking into
2011 /// account presence of floating point hardware and calling convention
2012 /// limitations, such as support for variadic functions.
2013 CallingConv::ID
2014 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2015 bool isVarArg) const {
2016 switch (CC) {
2017 default:
2018 report_fatal_error("Unsupported calling convention");
2019 case CallingConv::ARM_AAPCS:
2020 case CallingConv::ARM_APCS:
2021 case CallingConv::GHC:
2022 case CallingConv::CFGuard_Check:
2023 return CC;
2024 case CallingConv::PreserveMost:
2025 return CallingConv::PreserveMost;
2026 case CallingConv::ARM_AAPCS_VFP:
2027 case CallingConv::Swift:
2028 case CallingConv::SwiftTail:
2029 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
2030 case CallingConv::C:
2031 case CallingConv::Tail:
2032 if (!Subtarget->isAAPCS_ABI())
2033 return CallingConv::ARM_APCS;
2034 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
2035 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2036 !isVarArg)
2037 return CallingConv::ARM_AAPCS_VFP;
2038 else
2039 return CallingConv::ARM_AAPCS;
2040 case CallingConv::Fast:
2041 case CallingConv::CXX_FAST_TLS:
2042 if (!Subtarget->isAAPCS_ABI()) {
2043 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2044 return CallingConv::Fast;
2045 return CallingConv::ARM_APCS;
2046 } else if (Subtarget->hasVFP2Base() &&
2047 !Subtarget->isThumb1Only() && !isVarArg)
2048 return CallingConv::ARM_AAPCS_VFP;
2049 else
2050 return CallingConv::ARM_AAPCS;
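// Example: a plain C calling convention is remapped before argument
// assignment. On an AAPCS target whose core has VFP2 (and is not Thumb1-only),
// built with a hard float ABI (e.g. -mfloat-abi=hard), a non-variadic callee
// effectively uses ARM_AAPCS_VFP and passes FP arguments in s/d registers; the
// same callee declared variadic falls back to ARM_AAPCS, and a pre-AAPCS
// (APCS) target gets ARM_APCS regardless:
//   getEffectiveCallingConv(CallingConv::C, /*isVarArg=*/false) -> ARM_AAPCS_VFP
//   getEffectiveCallingConv(CallingConv::C, /*isVarArg=*/true)  -> ARM_AAPCS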
2054 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2055 bool isVarArg) const {
2056 return CCAssignFnForNode(CC, false, isVarArg);
2059 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
2060 bool isVarArg) const {
2061 return CCAssignFnForNode(CC, true, isVarArg);
2064 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2065 /// CallingConvention.
2066 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2067 bool Return,
2068 bool isVarArg) const {
2069 switch (getEffectiveCallingConv(CC, isVarArg)) {
2070 default:
2071 report_fatal_error("Unsupported calling convention");
2072 case CallingConv::ARM_APCS:
2073 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2074 case CallingConv::ARM_AAPCS:
2075 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2076 case CallingConv::ARM_AAPCS_VFP:
2077 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2078 case CallingConv::Fast:
2079 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2080 case CallingConv::GHC:
2081 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2082 case CallingConv::PreserveMost:
2083 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2084 case CallingConv::CFGuard_Check:
2085 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2089 SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2090 MVT LocVT, MVT ValVT, SDValue Val) const {
2091 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2092 Val);
2093 if (Subtarget->hasFullFP16()) {
2094 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2095 } else {
2096 Val = DAG.getNode(ISD::TRUNCATE, dl,
2097 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2098 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2100 return Val;
2103 SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2104 MVT LocVT, MVT ValVT,
2105 SDValue Val) const {
2106 if (Subtarget->hasFullFP16()) {
2107 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2108 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2109 } else {
2110 Val = DAG.getNode(ISD::BITCAST, dl,
2111 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2112 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2113 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2115 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
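// Example: an f16 argument that the ABI assigns to a 32-bit location travels
// in the low 16 bits of that location. With full FP16 the pair above uses a
// single VMOVhr / VMOVrh between the core register and the FP register file;
// without it the same effect is assembled from integer casts, roughly
//   incoming (MoveToHPR):   i32 -> truncate to i16 -> bitcast to f16
//   outgoing (MoveFromHPR): f16 -> bitcast to i16 -> zero-extend to i32
// before the final bitcast back to the location type.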
2118 /// LowerCallResult - Lower the result values of a call into the
2119 /// appropriate copies out of appropriate physical registers.
2120 SDValue ARMTargetLowering::LowerCallResult(
2121 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2122 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2123 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2124 SDValue ThisVal) const {
2125 // Assign locations to each value returned by this call.
2126 SmallVector<CCValAssign, 16> RVLocs;
2127 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2128 *DAG.getContext());
2129 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2131 // Copy all of the result registers out of their specified physreg.
2132 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2133 CCValAssign VA = RVLocs[i];
2135 // Pass 'this' value directly from the argument to return value, to avoid
2136 // reg unit interference
2137 if (i == 0 && isThisReturn) {
2138 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2139 "unexpected return calling convention register assignment");
2140 InVals.push_back(ThisVal);
2141 continue;
2144 SDValue Val;
2145 if (VA.needsCustom() &&
2146 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2147 // Handle f64 or half of a v2f64.
2148 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2149 InFlag);
2150 Chain = Lo.getValue(1);
2151 InFlag = Lo.getValue(2);
2152 VA = RVLocs[++i]; // skip ahead to next loc
2153 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2154 InFlag);
2155 Chain = Hi.getValue(1);
2156 InFlag = Hi.getValue(2);
2157 if (!Subtarget->isLittle())
2158 std::swap (Lo, Hi);
2159 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2161 if (VA.getLocVT() == MVT::v2f64) {
2162 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2163 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2164 DAG.getConstant(0, dl, MVT::i32));
2166 VA = RVLocs[++i]; // skip ahead to next loc
2167 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
2168 Chain = Lo.getValue(1);
2169 InFlag = Lo.getValue(2);
2170 VA = RVLocs[++i]; // skip ahead to next loc
2171 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
2172 Chain = Hi.getValue(1);
2173 InFlag = Hi.getValue(2);
2174 if (!Subtarget->isLittle())
2175 std::swap (Lo, Hi);
2176 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2177 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2178 DAG.getConstant(1, dl, MVT::i32));
2180 } else {
2181 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2182 InFlag);
2183 Chain = Val.getValue(1);
2184 InFlag = Val.getValue(2);
2187 switch (VA.getLocInfo()) {
2188 default: llvm_unreachable("Unknown loc info!");
2189 case CCValAssign::Full: break;
2190 case CCValAssign::BCvt:
2191 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2192 break;
2195 // f16 arguments have their size extended to 4 bytes and passed as if they
2196 // had been copied to the LSBs of a 32-bit register.
2197 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2198 if (VA.needsCustom() &&
2199 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2200 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2202 InVals.push_back(Val);
2205 return Chain;
2208 std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2209 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2210 bool IsTailCall, int SPDiff) const {
2211 SDValue DstAddr;
2212 MachinePointerInfo DstInfo;
2213 int32_t Offset = VA.getLocMemOffset();
2214 MachineFunction &MF = DAG.getMachineFunction();
2216 if (IsTailCall) {
2217 Offset += SPDiff;
2218 auto PtrVT = getPointerTy(DAG.getDataLayout());
2219 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2220 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2221 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2222 DstInfo =
2223 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
2224 } else {
2225 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2226 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2227 StackPtr, PtrOff);
2228 DstInfo =
2229 MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
2232 return std::make_pair(DstAddr, DstInfo);
2235 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2236 SDValue Chain, SDValue &Arg,
2237 RegsToPassVector &RegsToPass,
2238 CCValAssign &VA, CCValAssign &NextVA,
2239 SDValue &StackPtr,
2240 SmallVectorImpl<SDValue> &MemOpChains,
2241 bool IsTailCall,
2242 int SPDiff) const {
2243 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2244 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2245 unsigned id = Subtarget->isLittle() ? 0 : 1;
2246 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2248 if (NextVA.isRegLoc())
2249 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2250 else {
2251 assert(NextVA.isMemLoc());
2252 if (!StackPtr.getNode())
2253 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2254 getPointerTy(DAG.getDataLayout()));
2256 SDValue DstAddr;
2257 MachinePointerInfo DstInfo;
2258 std::tie(DstAddr, DstInfo) =
2259 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2260 MemOpChains.push_back(
2261 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
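// Example: an f64 assigned to the integer side of the ABI is first split into
// two i32 halves with ARMISD::VMOVRRD (a single "vmov rN, rM, dK"), then each
// half either rides in its GPR or is stored to the outgoing-argument slot from
// computeAddrForCallArg. For a double passed in r2/r3 on a little-endian
// target this ends up, schematically, as
//   vmov r2, r3, d0      // low word -> r2, high word -> r3 (halves swapped on
//                        // big-endian, which is what the "id" selection does)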
2265 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2266 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2267 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
2270 /// LowerCall - Lower a call into a callseq_start <-
2271 /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2272 /// nodes.
2273 SDValue
2274 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2275 SmallVectorImpl<SDValue> &InVals) const {
2276 SelectionDAG &DAG = CLI.DAG;
2277 SDLoc &dl = CLI.DL;
2278 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2279 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2280 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2281 SDValue Chain = CLI.Chain;
2282 SDValue Callee = CLI.Callee;
2283 bool &isTailCall = CLI.IsTailCall;
2284 CallingConv::ID CallConv = CLI.CallConv;
2285 bool doesNotRet = CLI.DoesNotReturn;
2286 bool isVarArg = CLI.IsVarArg;
2288 MachineFunction &MF = DAG.getMachineFunction();
2289 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2290 MachineFunction::CallSiteInfo CSInfo;
2291 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2292 bool isThisReturn = false;
2293 bool isCmseNSCall = false;
2294 bool isSibCall = false;
2295 bool PreferIndirect = false;
2297 // Determine whether this is a non-secure function call.
2298 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2299 isCmseNSCall = true;
2301 // Disable tail calls if they're not supported.
2302 if (!Subtarget->supportsTailCall())
2303 isTailCall = false;
2305 // For both the non-secure calls and the returns from a CMSE entry function,
2306   // the function needs to do some extra work after the call, or before the
2307   // return, respectively, thus it cannot end with a tail call
2308 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2309 isTailCall = false;
2311 if (isa<GlobalAddressSDNode>(Callee)) {
2312 // If we're optimizing for minimum size and the function is called three or
2313 // more times in this block, we can improve codesize by calling indirectly
2314 // as BLXr has a 16-bit encoding.
2315 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2316 if (CLI.CB) {
2317 auto *BB = CLI.CB->getParent();
2318 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2319 count_if(GV->users(), [&BB](const User *U) {
2320 return isa<Instruction>(U) &&
2321 cast<Instruction>(U)->getParent() == BB;
2322 }) > 2;
2325 if (isTailCall) {
2326 // Check if it's really possible to do a tail call.
2327 isTailCall = IsEligibleForTailCallOptimization(
2328 Callee, CallConv, isVarArg, isStructRet,
2329 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2330 PreferIndirect);
2332 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2333 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2334 isSibCall = true;
2336 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2337 // detected sibcalls.
2338 if (isTailCall)
2339 ++NumTailCalls;
2342 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2343 report_fatal_error("failed to perform tail call elimination on a call "
2344 "site marked musttail");
2345 // Analyze operands of the call, assigning locations to each operand.
2346 SmallVector<CCValAssign, 16> ArgLocs;
2347 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2348 *DAG.getContext());
2349 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2351 // Get a count of how many bytes are to be pushed on the stack.
2352 unsigned NumBytes = CCInfo.getNextStackOffset();
2354 // SPDiff is the byte offset of the call's argument area from the callee's.
2355 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2356 // by this amount for a tail call. In a sibling call it must be 0 because the
2357 // caller will deallocate the entire stack and the callee still expects its
2358 // arguments to begin at SP+0. Completely unused for non-tail calls.
2359 int SPDiff = 0;
2361 if (isTailCall && !isSibCall) {
2362 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2363 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2365 // Since callee will pop argument stack as a tail call, we must keep the
2366 // popped size 16-byte aligned.
2367 Align StackAlign = DAG.getDataLayout().getStackAlignment();
2368 NumBytes = alignTo(NumBytes, StackAlign);
2370 // SPDiff will be negative if this tail call requires more space than we
2371 // would automatically have in our incoming argument space. Positive if we
2372 // can actually shrink the stack.
2373 SPDiff = NumReusableBytes - NumBytes;
2375 // If this call requires more stack than we have available from
2376 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2377 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2378 AFI->setArgRegsSaveSize(-SPDiff);
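// Worked example: suppose the caller was itself entered with 16 bytes of
// stack-based arguments (NumReusableBytes = 16) and, after the alignment
// above, this tail call needs NumBytes = 32. Then SPDiff = 16 - 32 = -16: the
// callee's argument area starts 16 bytes below the caller's incoming one, so
// setArgRegsSaveSize(16) asks FrameLowering to reserve that extra space. A
// positive SPDiff would mean the tail call needs less argument stack than is
// already available, and nothing extra is reserved.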
2381 if (isSibCall) {
2382 // For sibling tail calls, memory operands are available in our caller's stack.
2383 NumBytes = 0;
2384 } else {
2385 // Adjust the stack pointer for the new arguments...
2386 // These operations are automatically eliminated by the prolog/epilog pass
2387 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2390 SDValue StackPtr =
2391 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2393 RegsToPassVector RegsToPass;
2394 SmallVector<SDValue, 8> MemOpChains;
2396 // During a tail call, stores to the argument area must happen after all of
2397 // the function's incoming arguments have been loaded because they may alias.
2398 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2399 // there's no point in doing so repeatedly so this tracks whether that's
2400 // happened yet.
2401 bool AfterFormalArgLoads = false;
2403 // Walk the register/memloc assignments, inserting copies/loads. In the case
2404 // of tail call optimization, arguments are handled later.
2405 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2406 i != e;
2407 ++i, ++realArgIdx) {
2408 CCValAssign &VA = ArgLocs[i];
2409 SDValue Arg = OutVals[realArgIdx];
2410 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2411 bool isByVal = Flags.isByVal();
2413 // Promote the value if needed.
2414 switch (VA.getLocInfo()) {
2415 default: llvm_unreachable("Unknown loc info!");
2416 case CCValAssign::Full: break;
2417 case CCValAssign::SExt:
2418 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2419 break;
2420 case CCValAssign::ZExt:
2421 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2422 break;
2423 case CCValAssign::AExt:
2424 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2425 break;
2426 case CCValAssign::BCvt:
2427 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2428 break;
2431 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2432 Chain = DAG.getStackArgumentTokenFactor(Chain);
2433 AfterFormalArgLoads = true;
2436 // f16 arguments have their size extended to 4 bytes and passed as if they
2437 // had been copied to the LSBs of a 32-bit register.
2438 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2439 if (VA.needsCustom() &&
2440 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2441 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2442 } else {
2443 // f16 arguments could have been extended prior to argument lowering.
2444       // Mask such arguments if this is a CMSE nonsecure call.
2445 auto ArgVT = Outs[realArgIdx].ArgVT;
2446 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2447 auto LocBits = VA.getLocVT().getSizeInBits();
2448 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2449 SDValue Mask =
2450 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2451 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2452 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2453 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2457 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2458 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2459 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2460 DAG.getConstant(0, dl, MVT::i32));
2461 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2462 DAG.getConstant(1, dl, MVT::i32));
2464 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2465 StackPtr, MemOpChains, isTailCall, SPDiff);
2467 VA = ArgLocs[++i]; // skip ahead to next loc
2468 if (VA.isRegLoc()) {
2469 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2470 StackPtr, MemOpChains, isTailCall, SPDiff);
2471 } else {
2472 assert(VA.isMemLoc());
2473 SDValue DstAddr;
2474 MachinePointerInfo DstInfo;
2475 std::tie(DstAddr, DstInfo) =
2476 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2477 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2479 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2480 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2481 StackPtr, MemOpChains, isTailCall, SPDiff);
2482 } else if (VA.isRegLoc()) {
2483 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2484 Outs[0].VT == MVT::i32) {
2485 assert(VA.getLocVT() == MVT::i32 &&
2486 "unexpected calling convention register assignment");
2487 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2488 "unexpected use of 'returned'");
2489 isThisReturn = true;
2491 const TargetOptions &Options = DAG.getTarget().Options;
2492 if (Options.EmitCallSiteInfo)
2493 CSInfo.emplace_back(VA.getLocReg(), i);
2494 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2495 } else if (isByVal) {
2496 assert(VA.isMemLoc());
2497 unsigned offset = 0;
2499 // True if this byval aggregate will be split between registers
2500 // and memory.
2501 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2502 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2504 if (CurByValIdx < ByValArgsCount) {
2506 unsigned RegBegin, RegEnd;
2507 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2509 EVT PtrVT =
2510 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2511 unsigned int i, j;
2512 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2513 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2514 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2515 SDValue Load =
2516 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2517 DAG.InferPtrAlign(AddArg));
2518 MemOpChains.push_back(Load.getValue(1));
2519 RegsToPass.push_back(std::make_pair(j, Load));
2522         // If the parameter size exceeds the register area, the "offset" value
2523         // helps us calculate the stack slot for the remaining part properly.
2524 offset = RegEnd - RegBegin;
2526 CCInfo.nextInRegsParam();
2529 if (Flags.getByValSize() > 4*offset) {
2530 auto PtrVT = getPointerTy(DAG.getDataLayout());
2531 SDValue Dst;
2532 MachinePointerInfo DstInfo;
2533 std::tie(Dst, DstInfo) =
2534 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2535 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2536 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2537 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2538 MVT::i32);
2539 SDValue AlignNode =
2540 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2542 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2543 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2544 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2545 Ops));
2547 } else {
2548 assert(VA.isMemLoc());
2549 SDValue DstAddr;
2550 MachinePointerInfo DstInfo;
2551 std::tie(DstAddr, DstInfo) =
2552 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2554 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2555 MemOpChains.push_back(Store);
2559 if (!MemOpChains.empty())
2560 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2562 // Build a sequence of copy-to-reg nodes chained together with token chain
2563 // and flag operands which copy the outgoing args into the appropriate regs.
2564 SDValue InFlag;
2565 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2566 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2567 RegsToPass[i].second, InFlag);
2568 InFlag = Chain.getValue(1);
2571 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2572 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2573 // node so that legalize doesn't hack it.
2574 bool isDirect = false;
2576 const TargetMachine &TM = getTargetMachine();
2577 const Module *Mod = MF.getFunction().getParent();
2578 const GlobalValue *GV = nullptr;
2579 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2580 GV = G->getGlobal();
2581 bool isStub =
2582 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2584 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2585 bool isLocalARMFunc = false;
2586 auto PtrVt = getPointerTy(DAG.getDataLayout());
2588 if (Subtarget->genLongCalls()) {
2589 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2590 "long-calls codegen is not position independent!");
2591 // Handle a global address or an external symbol. If it's not one of
2592 // those, the target's already in a register, so we don't need to do
2593 // anything extra.
2594 if (isa<GlobalAddressSDNode>(Callee)) {
2595 // Create a constant pool entry for the callee address
2596 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2597 ARMConstantPoolValue *CPV =
2598 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2600 // Get the address of the callee into a register
2601 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2602 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2603 Callee = DAG.getLoad(
2604 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2605 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2606 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2607 const char *Sym = S->getSymbol();
2609 // Create a constant pool entry for the callee address
2610 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2611 ARMConstantPoolValue *CPV =
2612 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2613 ARMPCLabelIndex, 0);
2614 // Get the address of the callee into a register
2615 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2616 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2617 Callee = DAG.getLoad(
2618 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2619 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2621 } else if (isa<GlobalAddressSDNode>(Callee)) {
2622 if (!PreferIndirect) {
2623 isDirect = true;
2624 bool isDef = GV->isStrongDefinitionForLinker();
2626 // ARM call to a local ARM function is predicable.
2627 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2628 // tBX takes a register source operand.
2629 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2630 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2631 Callee = DAG.getNode(
2632 ARMISD::WrapperPIC, dl, PtrVt,
2633 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2634 Callee = DAG.getLoad(
2635 PtrVt, dl, DAG.getEntryNode(), Callee,
2636 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2637 MachineMemOperand::MODereferenceable |
2638 MachineMemOperand::MOInvariant);
2639 } else if (Subtarget->isTargetCOFF()) {
2640 assert(Subtarget->isTargetWindows() &&
2641 "Windows is the only supported COFF target");
2642 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2643 if (GV->hasDLLImportStorageClass())
2644 TargetFlags = ARMII::MO_DLLIMPORT;
2645 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
2646 TargetFlags = ARMII::MO_COFFSTUB;
2647 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
2648 TargetFlags);
2649 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2650 Callee =
2651 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2652 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2653 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2654 } else {
2655 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2658 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2659 isDirect = true;
2660 // tBX takes a register source operand.
2661 const char *Sym = S->getSymbol();
2662 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2663 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2664 ARMConstantPoolValue *CPV =
2665 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2666 ARMPCLabelIndex, 4);
2667 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2668 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2669 Callee = DAG.getLoad(
2670 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2671 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2672 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2673 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2674 } else {
2675 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2679 if (isCmseNSCall) {
2680 assert(!isARMFunc && !isDirect &&
2681 "Cannot handle call to ARM function or direct call");
2682 if (NumBytes > 0) {
2683 DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
2684 "call to non-secure function would "
2685 "require passing arguments on stack",
2686 dl.getDebugLoc());
2687 DAG.getContext()->diagnose(Diag);
2689 if (isStructRet) {
2690 DiagnosticInfoUnsupported Diag(
2691 DAG.getMachineFunction().getFunction(),
2692 "call to non-secure function would return value through pointer",
2693 dl.getDebugLoc());
2694 DAG.getContext()->diagnose(Diag);
2698 // FIXME: handle tail calls differently.
2699 unsigned CallOpc;
2700 if (Subtarget->isThumb()) {
2701 if (isCmseNSCall)
2702 CallOpc = ARMISD::tSECALL;
2703 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2704 CallOpc = ARMISD::CALL_NOLINK;
2705 else
2706 CallOpc = ARMISD::CALL;
2707 } else {
2708 if (!isDirect && !Subtarget->hasV5TOps())
2709 CallOpc = ARMISD::CALL_NOLINK;
2710 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2711 // Emit regular call when code size is the priority
2712 !Subtarget->hasMinSize())
2713 // "mov lr, pc; b _foo" to avoid confusing the RSP
2714 CallOpc = ARMISD::CALL_NOLINK;
2715 else
2716 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2719 // We don't usually want to end the call-sequence here because we would tidy
2720 // the frame up *after* the call, however in the ABI-changing tail-call case
2721 // we've carefully laid out the parameters so that when sp is reset they'll be
2722 // in the correct location.
2723 if (isTailCall && !isSibCall) {
2724 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
2725 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2726 InFlag = Chain.getValue(1);
2729 std::vector<SDValue> Ops;
2730 Ops.push_back(Chain);
2731 Ops.push_back(Callee);
2733 if (isTailCall) {
2734 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2737 // Add argument registers to the end of the list so that they are known live
2738 // into the call.
2739 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2740 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2741 RegsToPass[i].second.getValueType()));
2743 // Add a register mask operand representing the call-preserved registers.
2744 if (!isTailCall) {
2745 const uint32_t *Mask;
2746 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2747 if (isThisReturn) {
2748 // For 'this' returns, use the R0-preserving mask if applicable
2749 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2750 if (!Mask) {
2751 // Set isThisReturn to false if the calling convention is not one that
2752 // allows 'returned' to be modeled in this way, so LowerCallResult does
2753 // not try to pass 'this' straight through
2754 isThisReturn = false;
2755 Mask = ARI->getCallPreservedMask(MF, CallConv);
2757 } else
2758 Mask = ARI->getCallPreservedMask(MF, CallConv);
2760 assert(Mask && "Missing call preserved mask for calling convention");
2761 Ops.push_back(DAG.getRegisterMask(Mask));
2764 if (InFlag.getNode())
2765 Ops.push_back(InFlag);
2767 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2768 if (isTailCall) {
2769 MF.getFrameInfo().setHasTailCall();
2770 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2771 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2772 return Ret;
2775 // Returns a chain and a flag for retval copy to use.
2776 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2777 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2778 InFlag = Chain.getValue(1);
2779 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2781 // If we're guaranteeing tail-calls will be honoured, the callee must
2782 // pop its own argument stack on return. But this call is *not* a tail call so
2783 // we need to undo that after it returns to restore the status-quo.
2784 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2785 uint64_t CalleePopBytes =
2786 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2788 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2789 DAG.getIntPtrConstant(CalleePopBytes, dl, true),
2790 InFlag, dl);
2791 if (!Ins.empty())
2792 InFlag = Chain.getValue(1);
2794 // Handle result values, copying them out of physregs into vregs that we
2795 // return.
2796 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2797 InVals, isThisReturn,
2798 isThisReturn ? OutVals[0] : SDValue());
2801 /// HandleByVal - Every parameter *after* a byval parameter is passed
2802 /// on the stack. Remember the next parameter register to allocate,
2803 /// and then confiscate the rest of the parameter registers to ensure
2804 /// this.
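/// For illustration (hypothetical register state): if a byval parameter with
/// 8-byte alignment arrives while r1 is the next free register, r1 is skipped
/// (wasted), the parameter starts in r2, and the remaining argument registers
/// are confiscated so that later parameters are placed on the stack.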
2805 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2806 Align Alignment) const {
2807 // Byval (as with any stack) slots are always at least 4 byte aligned.
2808 Alignment = std::max(Alignment, Align(4));
2810 unsigned Reg = State->AllocateReg(GPRArgRegs);
2811 if (!Reg)
2812 return;
2814 unsigned AlignInRegs = Alignment.value() / 4;
2815 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2816 for (unsigned i = 0; i < Waste; ++i)
2817 Reg = State->AllocateReg(GPRArgRegs);
2819 if (!Reg)
2820 return;
2822 unsigned Excess = 4 * (ARM::R4 - Reg);
2824 // Special case when NSAA != SP and the parameter size is greater than the
2825 // size of all remaining GPR registers. In that case we cannot split the
2826 // parameter; we must send it to the stack. We also must set NCRN to R4,
2827 // so all remaining registers are wasted.
2828 const unsigned NSAAOffset = State->getNextStackOffset();
2829 if (NSAAOffset != 0 && Size > Excess) {
2830 while (State->AllocateReg(GPRArgRegs))
2832 return;
2835 // The first register for the byval parameter is the first register that was
2836 // not allocated before this method call, i.e. "reg".
2837 // If the parameter is small enough to fit in the range [reg, r4), then the
2838 // end (one past the last) register is reg + param-size-in-regs; otherwise
2839 // the parameter is split between registers and the stack, and the end
2840 // register is r4.
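// Illustrative example: with reg == r1 and a 12-byte parameter, the byval
// occupies r1-r3 and ByValRegEnd is r4, so nothing is left in memory (Size is
// reduced to 0 below); with a 16-byte parameter, r1-r3 hold the first 12 bytes
// and the remaining 4 bytes stay on the stack.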
2841 unsigned ByValRegBegin = Reg;
2842 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2843 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2844 // Note: the first register was already allocated at the beginning of the
2845 // function; allocate the remaining registers we need.
2846 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2847 State->AllocateReg(GPRArgRegs);
2848 // A byval parameter that is split between registers and memory needs its
2849 // size truncated here.
2850 // In the case where the entire structure fits in registers, we set the
2851 // size in memory to zero.
2852 Size = std::max<int>(Size - Excess, 0);
2855 /// MatchingStackOffset - Return true if the given stack call argument is
2856 /// already available at the same relative position in the caller's
2857 /// incoming argument stack.
2858 static
2859 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2860 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2861 const TargetInstrInfo *TII) {
2862 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2863 int FI = std::numeric_limits<int>::max();
2864 if (Arg.getOpcode() == ISD::CopyFromReg) {
2865 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2866 if (!Register::isVirtualRegister(VR))
2867 return false;
2868 MachineInstr *Def = MRI->getVRegDef(VR);
2869 if (!Def)
2870 return false;
2871 if (!Flags.isByVal()) {
2872 if (!TII->isLoadFromStackSlot(*Def, FI))
2873 return false;
2874 } else {
2875 return false;
2877 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2878 if (Flags.isByVal())
2879 // ByVal argument is passed in as a pointer but it's now being
2880 // dereferenced. e.g.
2881 // define @foo(%struct.X* %A) {
2882 // tail call @bar(%struct.X* byval %A)
2883 // }
2884 return false;
2885 SDValue Ptr = Ld->getBasePtr();
2886 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2887 if (!FINode)
2888 return false;
2889 FI = FINode->getIndex();
2890 } else
2891 return false;
2893 assert(FI != std::numeric_limits<int>::max());
2894 if (!MFI.isFixedObjectIndex(FI))
2895 return false;
2896 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2899 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2900 /// for tail call optimization. Targets which want to do tail call
2901 /// optimization should implement this function.
2902 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2903 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2904 bool isCalleeStructRet, bool isCallerStructRet,
2905 const SmallVectorImpl<ISD::OutputArg> &Outs,
2906 const SmallVectorImpl<SDValue> &OutVals,
2907 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
2908 const bool isIndirect) const {
2909 MachineFunction &MF = DAG.getMachineFunction();
2910 const Function &CallerF = MF.getFunction();
2911 CallingConv::ID CallerCC = CallerF.getCallingConv();
2913 assert(Subtarget->supportsTailCall());
2915 // Indirect tail calls cannot be optimized for Thumb1 if the args
2916 // to the call take up r0-r3. The reason is that there are no legal registers
2917 // left to hold the pointer to the function to be called.
2918 if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
2919 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
2920 return false;
2922 // Look for obvious safe cases to perform tail call optimization that do not
2923 // require ABI changes. This is what gcc calls sibcall.
2925 // Exception-handling functions need a special set of instructions to indicate
2926 // a return to the hardware. Tail-calling another function would probably
2927 // break this.
2928 if (CallerF.hasFnAttribute("interrupt"))
2929 return false;
2931 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
2932 return CalleeCC == CallerCC;
2934 // Also avoid sibcall optimization if either caller or callee uses struct
2935 // return semantics.
2936 if (isCalleeStructRet || isCallerStructRet)
2937 return false;
2939 // Externally-defined functions with weak linkage should not be
2940 // tail-called on ARM when the OS does not support dynamic
2941 // pre-emption of symbols, as the AAELF spec requires normal calls
2942 // to undefined weak functions to be replaced with a NOP or jump to the
2943 // next instruction. The behaviour of branch instructions in this
2944 // situation (as used for tail calls) is implementation-defined, so we
2945 // cannot rely on the linker replacing the tail call with a return.
2946 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2947 const GlobalValue *GV = G->getGlobal();
2948 const Triple &TT = getTargetMachine().getTargetTriple();
2949 if (GV->hasExternalWeakLinkage() &&
2950 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2951 return false;
2954 // Check that the call results are passed in the same way.
2955 LLVMContext &C = *DAG.getContext();
2956 if (!CCState::resultsCompatible(
2957 getEffectiveCallingConv(CalleeCC, isVarArg),
2958 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2959 CCAssignFnForReturn(CalleeCC, isVarArg),
2960 CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
2961 return false;
2962 // The callee has to preserve all registers the caller needs to preserve.
2963 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2964 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2965 if (CalleeCC != CallerCC) {
2966 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2967 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2968 return false;
2971 // If the caller's vararg or byval argument has been split between registers
2972 // and the stack, do not perform a tail call, since part of the argument is in
2973 // the caller's local frame.
2974 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2975 if (AFI_Caller->getArgRegsSaveSize())
2976 return false;
2978 // If the callee takes no arguments then go on to check the results of the
2979 // call.
2980 if (!Outs.empty()) {
2981 // Check if stack adjustment is needed. For now, do not do this if any
2982 // argument is passed on the stack.
2983 SmallVector<CCValAssign, 16> ArgLocs;
2984 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
2985 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
2986 if (CCInfo.getNextStackOffset()) {
2987 // Check if the arguments are already laid out in the right way as
2988 // the caller's fixed stack objects.
2989 MachineFrameInfo &MFI = MF.getFrameInfo();
2990 const MachineRegisterInfo *MRI = &MF.getRegInfo();
2991 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2992 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2993 i != e;
2994 ++i, ++realArgIdx) {
2995 CCValAssign &VA = ArgLocs[i];
2996 EVT RegVT = VA.getLocVT();
2997 SDValue Arg = OutVals[realArgIdx];
2998 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2999 if (VA.getLocInfo() == CCValAssign::Indirect)
3000 return false;
3001 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3002 // f64 and vector types are split into multiple registers or
3003 // register/stack-slot combinations. The types will not match
3004 // the registers; give up on memory f64 refs until we figure
3005 // out what to do about this.
3006 if (!VA.isRegLoc())
3007 return false;
3008 if (!ArgLocs[++i].isRegLoc())
3009 return false;
3010 if (RegVT == MVT::v2f64) {
3011 if (!ArgLocs[++i].isRegLoc())
3012 return false;
3013 if (!ArgLocs[++i].isRegLoc())
3014 return false;
3016 } else if (!VA.isRegLoc()) {
3017 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3018 MFI, MRI, TII))
3019 return false;
3024 const MachineRegisterInfo &MRI = MF.getRegInfo();
3025 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3026 return false;
3029 return true;
3032 bool
3033 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3034 MachineFunction &MF, bool isVarArg,
3035 const SmallVectorImpl<ISD::OutputArg> &Outs,
3036 LLVMContext &Context) const {
3037 SmallVector<CCValAssign, 16> RVLocs;
3038 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3039 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3042 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3043 const SDLoc &DL, SelectionDAG &DAG) {
3044 const MachineFunction &MF = DAG.getMachineFunction();
3045 const Function &F = MF.getFunction();
3047 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3049 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3050 // version of the "preferred return address". These offsets affect the return
3051 // instruction if this is a return from PL1 without hypervisor extensions.
3052 // IRQ/FIQ: +4 "subs pc, lr, #4"
3053 // SWI: 0 "subs pc, lr, #0"
3054 // ABORT: +4 "subs pc, lr, #4"
3055 // UNDEF: +4/+2 "subs pc, lr, #0"
3056 // UNDEF varies depending on whether the exception came from ARM or Thumb
3057 // mode. Like GCC, we throw our hands up in disgust and pretend it's 0.
3059 int64_t LROffset;
3060 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3061 IntKind == "ABORT")
3062 LROffset = 4;
3063 else if (IntKind == "SWI" || IntKind == "UNDEF")
3064 LROffset = 0;
3065 else
3066 report_fatal_error("Unsupported interrupt attribute. If present, value "
3067 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3069 RetOps.insert(RetOps.begin() + 1,
3070 DAG.getConstant(LROffset, DL, MVT::i32, false));
3072 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
3075 SDValue
3076 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3077 bool isVarArg,
3078 const SmallVectorImpl<ISD::OutputArg> &Outs,
3079 const SmallVectorImpl<SDValue> &OutVals,
3080 const SDLoc &dl, SelectionDAG &DAG) const {
3081 // CCValAssign - represent the assignment of the return value to a location.
3082 SmallVector<CCValAssign, 16> RVLocs;
3084 // CCState - Info about the registers and stack slots.
3085 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3086 *DAG.getContext());
3088 // Analyze outgoing return values.
3089 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3091 SDValue Flag;
3092 SmallVector<SDValue, 4> RetOps;
3093 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3094 bool isLittleEndian = Subtarget->isLittle();
3096 MachineFunction &MF = DAG.getMachineFunction();
3097 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3098 AFI->setReturnRegsCount(RVLocs.size());
3100 // Report error if cmse entry function returns structure through first ptr arg.
3101 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3102 // Note: using an empty SDLoc(), as the first line of the function is a
3103 // better place to report than the last line.
3104 DiagnosticInfoUnsupported Diag(
3105 DAG.getMachineFunction().getFunction(),
3106 "secure entry function would return value through pointer",
3107 SDLoc().getDebugLoc());
3108 DAG.getContext()->diagnose(Diag);
3111 // Copy the result values into the output registers.
3112 for (unsigned i = 0, realRVLocIdx = 0;
3113 i != RVLocs.size();
3114 ++i, ++realRVLocIdx) {
3115 CCValAssign &VA = RVLocs[i];
3116 assert(VA.isRegLoc() && "Can only return in registers!");
3118 SDValue Arg = OutVals[realRVLocIdx];
3119 bool ReturnF16 = false;
3121 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3122 // Half-precision return values can be returned like this:
3124 // t11: f16 = fadd ...
3125 // t12: i16 = bitcast t11
3126 // t13: i32 = zero_extend t12
3127 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3129 // to avoid code generation for bitcasts, we simply set Arg to the node
3130 // that produces the f16 value, t11 in this case.
3132 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3133 SDValue ZE = Arg.getOperand(0);
3134 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3135 SDValue BC = ZE.getOperand(0);
3136 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3137 Arg = BC.getOperand(0);
3138 ReturnF16 = true;
3144 switch (VA.getLocInfo()) {
3145 default: llvm_unreachable("Unknown loc info!");
3146 case CCValAssign::Full: break;
3147 case CCValAssign::BCvt:
3148 if (!ReturnF16)
3149 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3150 break;
3153 // Mask f16 arguments if this is a CMSE nonsecure entry.
3154 auto RetVT = Outs[realRVLocIdx].ArgVT;
3155 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3156 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3157 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3158 } else {
3159 auto LocBits = VA.getLocVT().getSizeInBits();
3160 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3161 SDValue Mask =
3162 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3163 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3164 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3165 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3169 if (VA.needsCustom() &&
3170 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3171 if (VA.getLocVT() == MVT::v2f64) {
3172 // Extract the first half and return it in two registers.
3173 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3174 DAG.getConstant(0, dl, MVT::i32));
3175 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3176 DAG.getVTList(MVT::i32, MVT::i32), Half);
3178 Chain =
3179 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3180 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
3181 Flag = Chain.getValue(1);
3182 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3183 VA = RVLocs[++i]; // skip ahead to next loc
3184 Chain =
3185 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3186 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
3187 Flag = Chain.getValue(1);
3188 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3189 VA = RVLocs[++i]; // skip ahead to next loc
3191 // Extract the 2nd half and fall through to handle it as an f64 value.
3192 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3193 DAG.getConstant(1, dl, MVT::i32));
3195 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3196 // available.
3197 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3198 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3199 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3200 fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
3201 Flag = Chain.getValue(1);
3202 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3203 VA = RVLocs[++i]; // skip ahead to next loc
3204 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3205 fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
3206 } else
3207 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
3209 // Guarantee that all emitted copies are glued
3210 // together, so nothing else can be scheduled in between them.
3211 Flag = Chain.getValue(1);
3212 RetOps.push_back(DAG.getRegister(
3213 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3215 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3216 const MCPhysReg *I =
3217 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3218 if (I) {
3219 for (; *I; ++I) {
3220 if (ARM::GPRRegClass.contains(*I))
3221 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3222 else if (ARM::DPRRegClass.contains(*I))
3223 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3224 else
3225 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3229 // Update chain and glue.
3230 RetOps[0] = Chain;
3231 if (Flag.getNode())
3232 RetOps.push_back(Flag);
3234 // CPUs which aren't M-class use a special sequence to return from
3235 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3236 // though we use "subs pc, lr, #N").
3238 // M-class CPUs actually use a normal return sequence with a special
3239 // (hardware-provided) value in LR, so the normal code path works.
3240 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3241 !Subtarget->isMClass()) {
3242 if (Subtarget->isThumb1Only())
3243 report_fatal_error("interrupt attribute is not supported in Thumb1");
3244 return LowerInterruptReturn(RetOps, dl, DAG);
3247 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
3248 ARMISD::RET_FLAG;
3249 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3252 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3253 if (N->getNumValues() != 1)
3254 return false;
3255 if (!N->hasNUsesOfValue(1, 0))
3256 return false;
3258 SDValue TCChain = Chain;
3259 SDNode *Copy = *N->use_begin();
3260 if (Copy->getOpcode() == ISD::CopyToReg) {
3261 // If the copy has a glue operand, we conservatively assume it isn't safe to
3262 // perform a tail call.
3263 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3264 return false;
3265 TCChain = Copy->getOperand(0);
3266 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3267 SDNode *VMov = Copy;
3268 // f64 returned in a pair of GPRs.
3269 SmallPtrSet<SDNode*, 2> Copies;
3270 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
3271 UI != UE; ++UI) {
3272 if (UI->getOpcode() != ISD::CopyToReg)
3273 return false;
3274 Copies.insert(*UI);
3276 if (Copies.size() > 2)
3277 return false;
3279 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
3280 UI != UE; ++UI) {
3281 SDValue UseChain = UI->getOperand(0);
3282 if (Copies.count(UseChain.getNode()))
3283 // Second CopyToReg
3284 Copy = *UI;
3285 else {
3286 // We are at the top of this chain.
3287 // If the copy has a glue operand, we conservatively assume it
3288 // isn't safe to perform a tail call.
3289 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
3290 return false;
3291 // First CopyToReg
3292 TCChain = UseChain;
3295 } else if (Copy->getOpcode() == ISD::BITCAST) {
3296 // f32 returned in a single GPR.
3297 if (!Copy->hasOneUse())
3298 return false;
3299 Copy = *Copy->use_begin();
3300 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3301 return false;
3302 // If the copy has a glue operand, we conservatively assume it isn't safe to
3303 // perform a tail call.
3304 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3305 return false;
3306 TCChain = Copy->getOperand(0);
3307 } else {
3308 return false;
3311 bool HasRet = false;
3312 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
3313 UI != UE; ++UI) {
3314 if (UI->getOpcode() != ARMISD::RET_FLAG &&
3315 UI->getOpcode() != ARMISD::INTRET_FLAG)
3316 return false;
3317 HasRet = true;
3320 if (!HasRet)
3321 return false;
3323 Chain = TCChain;
3324 return true;
3327 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3328 if (!Subtarget->supportsTailCall())
3329 return false;
3331 if (!CI->isTailCall())
3332 return false;
3334 return true;
3337 // Writing a 64-bit value requires splitting it into two 32-bit halves first,
3338 // then passing the low and high parts through separately.
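// For example (illustrative), (WRITE_REGISTER chain, "reg", i64 %v) becomes
// (WRITE_REGISTER chain, "reg", lo32(%v), hi32(%v)), where the two halves are
// produced by EXTRACT_ELEMENT.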
3339 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3340 SDLoc DL(Op);
3341 SDValue WriteValue = Op->getOperand(2);
3343 // This function is only supposed to be called for i64 type argument.
3344 assert(WriteValue.getValueType() == MVT::i64
3345 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3347 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
3348 DAG.getConstant(0, DL, MVT::i32));
3349 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
3350 DAG.getConstant(1, DL, MVT::i32));
3351 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3352 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3355 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3356 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3357 // one of the above mentioned nodes. It has to be wrapped because otherwise
3358 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3359 // be used to form addressing mode. These wrapped nodes will be selected
3360 // into MOVi.
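// For example (illustrative), a ConstantPool node CP#0 becomes
// (ARMISD::Wrapper (TargetConstantPool CP#0)); the wrapped target node can
// then be folded into an addressing mode or materialized with a constant move.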
3361 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3362 SelectionDAG &DAG) const {
3363 EVT PtrVT = Op.getValueType();
3364 // FIXME there is no actual debug info here
3365 SDLoc dl(Op);
3366 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3367 SDValue Res;
3369 // When generating execute-only code Constant Pools must be promoted to the
3370 // global data section. It's a bit ugly that we can't share them across basic
3371 // blocks, but this way we guarantee that execute-only behaves correctly with
3372 // position-independent addressing modes.
3373 if (Subtarget->genExecuteOnly()) {
3374 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3375 auto T = const_cast<Type*>(CP->getType());
3376 auto C = const_cast<Constant*>(CP->getConstVal());
3377 auto M = const_cast<Module*>(DAG.getMachineFunction().
3378 getFunction().getParent());
3379 auto GV = new GlobalVariable(
3380 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3381 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3382 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3383 Twine(AFI->createPICLabelUId())
3385 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3386 dl, PtrVT);
3387 return LowerGlobalAddress(GA, DAG);
3390 if (CP->isMachineConstantPoolEntry())
3391 Res =
3392 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
3393 else
3394 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
3395 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3398 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3399 return MachineJumpTableInfo::EK_Inline;
3402 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3403 SelectionDAG &DAG) const {
3404 MachineFunction &MF = DAG.getMachineFunction();
3405 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3406 unsigned ARMPCLabelIndex = 0;
3407 SDLoc DL(Op);
3408 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3409 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3410 SDValue CPAddr;
3411 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3412 if (!IsPositionIndependent) {
3413 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3414 } else {
3415 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3416 ARMPCLabelIndex = AFI->createPICLabelUId();
3417 ARMConstantPoolValue *CPV =
3418 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3419 ARMCP::CPBlockAddress, PCAdj);
3420 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3422 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3423 SDValue Result = DAG.getLoad(
3424 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3425 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3426 if (!IsPositionIndependent)
3427 return Result;
3428 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3429 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3432 /// Convert a TLS address reference into the correct sequence of loads
3433 /// and calls to compute the variable's address for Darwin, and return an
3434 /// SDValue containing the final node.
3436 /// Darwin only has one TLS scheme which must be capable of dealing with the
3437 /// fully general situation, in the worst case. This means:
3438 /// + "extern __thread" declaration.
3439 /// + Defined in a possibly unknown dynamic library.
3441 /// The general system is that each __thread variable has a [3 x i32] descriptor
3442 /// which contains information used by the runtime to calculate the address. The
3443 /// only part of this the compiler needs to know about is the first word, which
3444 /// contains a function pointer that must be called with the address of the
3445 /// entire descriptor in "r0".
3447 /// Since this descriptor may be in a different unit, in general access must
3448 /// proceed along the usual ARM rules. A common sequence to produce is:
3450 /// movw rT1, :lower16:_var$non_lazy_ptr
3451 /// movt rT1, :upper16:_var$non_lazy_ptr
3452 /// ldr r0, [rT1]
3453 /// ldr rT2, [r0]
3454 /// blx rT2
3455 /// [...address now in r0...]
3456 SDValue
3457 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3458 SelectionDAG &DAG) const {
3459 assert(Subtarget->isTargetDarwin() &&
3460 "This function expects a Darwin target");
3461 SDLoc DL(Op);
3463 // The first step is to get the address of the actual global symbol. This is
3464 // the TLS descriptor lives.
3465 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3467 // The first entry in the descriptor is a function pointer that we must call
3468 // to obtain the address of the variable.
3469 SDValue Chain = DAG.getEntryNode();
3470 SDValue FuncTLVGet = DAG.getLoad(
3471 MVT::i32, DL, Chain, DescAddr,
3472 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3473 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3474 MachineMemOperand::MOInvariant);
3475 Chain = FuncTLVGet.getValue(1);
3477 MachineFunction &F = DAG.getMachineFunction();
3478 MachineFrameInfo &MFI = F.getFrameInfo();
3479 MFI.setAdjustsStack(true);
3481 // TLS calls preserve all registers except those that absolutely must be
3482 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3483 // silly).
3484 auto TRI =
3485 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3486 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3487 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3489 // Finally, we can make the call. This is just a degenerate version of a
3490 // normal ARM call node: r0 takes the address of the descriptor, and the
3491 // call returns the address of the variable in this thread.
3492 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3493 Chain =
3494 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3495 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3496 DAG.getRegisterMask(Mask), Chain.getValue(1));
3497 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3500 SDValue
3501 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3502 SelectionDAG &DAG) const {
3503 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3505 SDValue Chain = DAG.getEntryNode();
3506 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3507 SDLoc DL(Op);
3509 // Load the current TEB (thread environment block)
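// (Illustration: the operand list below corresponds to
// "mrc p15, #0, <Rt>, c13, c0, #2", a read of the TPIDRURW thread ID
// register, which holds the TEB pointer on Windows.)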
3510 SDValue Ops[] = {Chain,
3511 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3512 DAG.getTargetConstant(15, DL, MVT::i32),
3513 DAG.getTargetConstant(0, DL, MVT::i32),
3514 DAG.getTargetConstant(13, DL, MVT::i32),
3515 DAG.getTargetConstant(0, DL, MVT::i32),
3516 DAG.getTargetConstant(2, DL, MVT::i32)};
3517 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3518 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3520 SDValue TEB = CurrentTEB.getValue(0);
3521 Chain = CurrentTEB.getValue(1);
3523 // Load the ThreadLocalStoragePointer from the TEB
3524 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3525 SDValue TLSArray =
3526 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3527 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3529 // The pointer to the thread's TLS data area is found at the offset
3530 // TLS Index * 4 into the TLSArray.
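// Roughly (illustrative): TlsData = *( *(TEB + 0x2c) + 4 * _tls_index ), and
// the address of the variable is TlsData + SECREL(GV), which the loads and
// adds below compute.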
3532 // Load the TLS index from the C runtime
3533 SDValue TLSIndex =
3534 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3535 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3536 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3538 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3539 DAG.getConstant(2, DL, MVT::i32));
3540 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3541 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3542 MachinePointerInfo());
3544 // Get the offset of the start of the .tls section (section base)
3545 const auto *GA = cast<GlobalAddressSDNode>(Op);
3546 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3547 SDValue Offset = DAG.getLoad(
3548 PtrVT, DL, Chain,
3549 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3550 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3551 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3553 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3556 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
3557 SDValue
3558 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3559 SelectionDAG &DAG) const {
3560 SDLoc dl(GA);
3561 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3562 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3563 MachineFunction &MF = DAG.getMachineFunction();
3564 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3565 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3566 ARMConstantPoolValue *CPV =
3567 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3568 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3569 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3570 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3571 Argument = DAG.getLoad(
3572 PtrVT, dl, DAG.getEntryNode(), Argument,
3573 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3574 SDValue Chain = Argument.getValue(1);
3576 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3577 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3579 // call __tls_get_addr.
3580 ArgListTy Args;
3581 ArgListEntry Entry;
3582 Entry.Node = Argument;
3583 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
3584 Args.push_back(Entry);
3586 // FIXME: is there useful debug info available here?
3587 TargetLowering::CallLoweringInfo CLI(DAG);
3588 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3589 CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3590 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3592 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3593 return CallResult.first;
3596 // Lower ISD::GlobalTLSAddress using the "initial exec" or
3597 // "local exec" model.
3598 SDValue
3599 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3600 SelectionDAG &DAG,
3601 TLSModel::Model model) const {
3602 const GlobalValue *GV = GA->getGlobal();
3603 SDLoc dl(GA);
3604 SDValue Offset;
3605 SDValue Chain = DAG.getEntryNode();
3606 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3607 // Get the Thread Pointer
3608 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3610 if (model == TLSModel::InitialExec) {
3611 MachineFunction &MF = DAG.getMachineFunction();
3612 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3613 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3614 // Initial exec model.
3615 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3616 ARMConstantPoolValue *CPV =
3617 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3618 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3619 true);
3620 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3621 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3622 Offset = DAG.getLoad(
3623 PtrVT, dl, Chain, Offset,
3624 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3625 Chain = Offset.getValue(1);
3627 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3628 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3630 Offset = DAG.getLoad(
3631 PtrVT, dl, Chain, Offset,
3632 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3633 } else {
3634 // local exec model
3635 assert(model == TLSModel::LocalExec);
3636 ARMConstantPoolValue *CPV =
3637 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3638 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3639 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3640 Offset = DAG.getLoad(
3641 PtrVT, dl, Chain, Offset,
3642 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3645 // The address of the thread local variable is the add of the thread
3646 // pointer with the offset of the variable.
3647 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3650 SDValue
3651 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3652 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3653 if (DAG.getTarget().useEmulatedTLS())
3654 return LowerToTLSEmulatedModel(GA, DAG);
3656 if (Subtarget->isTargetDarwin())
3657 return LowerGlobalTLSAddressDarwin(Op, DAG);
3659 if (Subtarget->isTargetWindows())
3660 return LowerGlobalTLSAddressWindows(Op, DAG);
3662 // TODO: implement the "local dynamic" model
3663 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3664 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3666 switch (model) {
3667 case TLSModel::GeneralDynamic:
3668 case TLSModel::LocalDynamic:
3669 return LowerToTLSGeneralDynamicModel(GA, DAG);
3670 case TLSModel::InitialExec:
3671 case TLSModel::LocalExec:
3672 return LowerToTLSExecModels(GA, DAG, model);
3674 llvm_unreachable("bogus TLS model");
3677 /// Return true if all users of V are within function F, looking through
3678 /// ConstantExprs.
3679 static bool allUsersAreInFunction(const Value *V, const Function *F) {
3680 SmallVector<const User*,4> Worklist(V->users());
3681 while (!Worklist.empty()) {
3682 auto *U = Worklist.pop_back_val();
3683 if (isa<ConstantExpr>(U)) {
3684 append_range(Worklist, U->users());
3685 continue;
3688 auto *I = dyn_cast<Instruction>(U);
3689 if (!I || I->getParent()->getParent() != F)
3690 return false;
3692 return true;
3695 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3696 const GlobalValue *GV, SelectionDAG &DAG,
3697 EVT PtrVT, const SDLoc &dl) {
3698 // If we're creating a pool entry for a constant global with unnamed address,
3699 // and the global is small enough, we can emit it inline into the constant pool
3700 // to save ourselves an indirection.
3702 // This is a win if the constant is only used in one function (so it doesn't
3703 // need to be duplicated) or duplicating the constant wouldn't increase code
3704 // size (implying the constant is no larger than 4 bytes).
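// Illustrative example: a small local string constant such as
//   @.str = private unnamed_addr constant [4 x i8] c"abc\00"
// that is referenced from a single function can be emitted straight into that
// function's constant pool instead of as a separate global plus an address
// literal.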
3705 const Function &F = DAG.getMachineFunction().getFunction();
3707 // We rely on this decision to inline being idempotent and unrelated to the
3708 // use-site. We know that if we inline a variable at one use site, we'll
3709 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3710 // doesn't know about this optimization, so bail out if it's enabled;
3711 // otherwise we could decide to inline here (and thus never emit the GV)
3712 // while fast-isel generated code still requires the GV.
3713 if (!EnableConstpoolPromotion ||
3714 DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3715 return SDValue();
3717 auto *GVar = dyn_cast<GlobalVariable>(GV);
3718 if (!GVar || !GVar->hasInitializer() ||
3719 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3720 !GVar->hasLocalLinkage())
3721 return SDValue();
3723 // If we inline a value that contains relocations, we move the relocations
3724 // from .data to .text. This is not allowed in position-independent code.
3725 auto *Init = GVar->getInitializer();
3726 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3727 Init->needsDynamicRelocation())
3728 return SDValue();
3730 // The constant islands pass can only really deal with alignment requests
3731 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3732 // any type with alignment requirements greater than 4 bytes. We also
3733 // can only promote constants that are multiples of 4 bytes in size or
3734 // are paddable to a multiple of 4. Currently we only try to pad constants
3735 // that are strings for simplicity.
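// For example (illustrative), a 6-byte string initializer is padded with two
// zero bytes up to 8 bytes before being placed in the pool, whereas a 6-byte
// non-string initializer is simply not promoted.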
3736 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3737 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3738 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3739 unsigned RequiredPadding = 4 - (Size % 4);
3740 bool PaddingPossible =
3741 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3742 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3743 Size == 0)
3744 return SDValue();
3746 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3747 MachineFunction &MF = DAG.getMachineFunction();
3748 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3750 // We can't bloat the constant pool too much, else the ConstantIslands pass
3751 // may fail to converge. If we haven't promoted this global yet (it may have
3752 // multiple uses), and promoting it would increase the constant pool size (Sz
3753 // > 4), ensure we have space to do so up to MaxTotal.
3754 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3755 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3756 ConstpoolPromotionMaxTotal)
3757 return SDValue();
3759 // This is only valid if all users are in a single function; we can't clone
3760 // the constant in general. The LLVM IR unnamed_addr allows merging
3761 // constants, but not cloning them.
3763 // We could potentially allow cloning if we could prove all uses of the
3764 // constant in the current function don't care about the address, like
3765 // printf format strings. But that isn't implemented for now.
3766 if (!allUsersAreInFunction(GVar, &F))
3767 return SDValue();
3769 // We're going to inline this global. Pad it out if needed.
3770 if (RequiredPadding != 4) {
3771 StringRef S = CDAInit->getAsString();
3773 SmallVector<uint8_t,16> V(S.size());
3774 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3775 while (RequiredPadding--)
3776 V.push_back(0);
3777 Init = ConstantDataArray::get(*DAG.getContext(), V);
3780 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3781 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3782 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3783 AFI->markGlobalAsPromotedToConstantPool(GVar);
3784 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3785 PaddedSize - 4);
3787 ++NumConstpoolPromoted;
3788 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3791 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
3792 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3793 if (!(GV = GA->getBaseObject()))
3794 return false;
3795 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3796 return V->isConstant();
3797 return isa<Function>(GV);
3800 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3801 SelectionDAG &DAG) const {
3802 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3803 default: llvm_unreachable("unknown object format");
3804 case Triple::COFF:
3805 return LowerGlobalAddressWindows(Op, DAG);
3806 case Triple::ELF:
3807 return LowerGlobalAddressELF(Op, DAG);
3808 case Triple::MachO:
3809 return LowerGlobalAddressDarwin(Op, DAG);
3813 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3814 SelectionDAG &DAG) const {
3815 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3816 SDLoc dl(Op);
3817 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3818 const TargetMachine &TM = getTargetMachine();
3819 bool IsRO = isReadOnly(GV);
3821 // Call promoteToConstantPool only if not generating an execute-only (XO) text section
3822 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
3823 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3824 return V;
3826 if (isPositionIndependent()) {
3827 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3828 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
3829 UseGOT_PREL ? ARMII::MO_GOT : 0);
3830 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3831 if (UseGOT_PREL)
3832 Result =
3833 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3834 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3835 return Result;
3836 } else if (Subtarget->isROPI() && IsRO) {
3837 // PC-relative.
3838 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3839 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3840 return Result;
3841 } else if (Subtarget->isRWPI() && !IsRO) {
3842 // SB-relative.
3843 SDValue RelAddr;
3844 if (Subtarget->useMovt()) {
3845 ++NumMovwMovt;
3846 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3847 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3848 } else { // use literal pool for address constant
3849 ARMConstantPoolValue *CPV =
3850 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3851 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3852 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3853 RelAddr = DAG.getLoad(
3854 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3855 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3857 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3858 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3859 return Result;
3862 // If we have T2 ops, we can materialize the address directly via movt/movw
3863 // pair. This is always cheaper.
3864 if (Subtarget->useMovt()) {
3865 ++NumMovwMovt;
3866 // FIXME: Once remat is capable of dealing with instructions with register
3867 // operands, expand this into two nodes.
3868 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3869 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3870 } else {
3871 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3872 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3873 return DAG.getLoad(
3874 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3875 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3879 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3880 SelectionDAG &DAG) const {
3881 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3882 "ROPI/RWPI not currently supported for Darwin");
3883 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3884 SDLoc dl(Op);
3885 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3887 if (Subtarget->useMovt())
3888 ++NumMovwMovt;
3890 // FIXME: Once remat is capable of dealing with instructions with register
3891 // operands, expand this into multiple nodes
3892 unsigned Wrapper =
3893 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3895 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3896 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3898 if (Subtarget->isGVIndirectSymbol(GV))
3899 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3900 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3901 return Result;
3904 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3905 SelectionDAG &DAG) const {
3906 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3907 assert(Subtarget->useMovt() &&
3908 "Windows on ARM expects to use movw/movt");
3909 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3910 "ROPI/RWPI not currently supported for Windows");
3912 const TargetMachine &TM = getTargetMachine();
3913 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3914 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3915 if (GV->hasDLLImportStorageClass())
3916 TargetFlags = ARMII::MO_DLLIMPORT;
3917 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
3918 TargetFlags = ARMII::MO_COFFSTUB;
3919 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3920 SDValue Result;
3921 SDLoc DL(Op);
3923 ++NumMovwMovt;
3925 // FIXME: Once remat is capable of dealing with instructions with register
3926 // operands, expand this into two nodes.
3927 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3928 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3929 TargetFlags));
3930 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3931 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3932 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3933 return Result;
3936 SDValue
3937 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3938 SDLoc dl(Op);
3939 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3940 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3941 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3942 Op.getOperand(1), Val);
3945 SDValue
3946 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3947 SDLoc dl(Op);
3948 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3949 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3952 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3953 SelectionDAG &DAG) const {
3954 SDLoc dl(Op);
3955 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3956 Op.getOperand(0));
3959 SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3960 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
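// INTRINSIC_VOID nodes carry a chain, so when operand 0 has type MVT::Other
// (the chain) the intrinsic ID is operand 1; otherwise it is operand 0. The
// boolean-to-index conversion below selects the right operand.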
3961 unsigned IntNo =
3962 cast<ConstantSDNode>(
3963 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
3964 ->getZExtValue();
3965 switch (IntNo) {
3966 default:
3967 return SDValue(); // Don't custom lower most intrinsics.
3968 case Intrinsic::arm_gnu_eabi_mcount: {
3969 MachineFunction &MF = DAG.getMachineFunction();
3970 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3971 SDLoc dl(Op);
3972 SDValue Chain = Op.getOperand(0);
3973 // call "\01__gnu_mcount_nc"
3974 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3975 const uint32_t *Mask =
3976 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3977 assert(Mask && "Missing call preserved mask for calling convention");
3978 // Mark LR an implicit live-in.
3979 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3980 SDValue ReturnAddress =
3981 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3982 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3983 SDValue Callee =
3984 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
3985 SDValue RegisterMask = DAG.getRegisterMask(Mask);
3986 if (Subtarget->isThumb())
3987 return SDValue(
3988 DAG.getMachineNode(
3989 ARM::tBL_PUSHLR, dl, ResultTys,
3990 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3991 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3993 return SDValue(
3994 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3995 {ReturnAddress, Callee, RegisterMask, Chain}),
4001 SDValue
4002 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4003 const ARMSubtarget *Subtarget) const {
4004 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4005 SDLoc dl(Op);
4006 switch (IntNo) {
4007 default: return SDValue(); // Don't custom lower most intrinsics.
4008 case Intrinsic::thread_pointer: {
4009 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4010 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4012 case Intrinsic::arm_cls: {
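// cls(x) is expanded as ctlz((((x asr 31) ^ x) << 1) | 1): xor-ing with the
// sign mask turns the leading sign bits into leading zeroes, the shift left
// by one compensates for the sign bit itself, and the "| 1" keeps the ctlz
// input non-zero.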
4013 const SDValue &Operand = Op.getOperand(1);
4014 const EVT VTy = Op.getValueType();
4015 SDValue SRA =
4016 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4017 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4018 SDValue SHL =
4019 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4020 SDValue OR =
4021 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4022 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4023 return Result;
4025 case Intrinsic::arm_cls64: {
4026 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4027 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
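// Worked example (illustrative): for x = 0x000000000000FFFF, cls(hi) is 31
// and hi == 0, so the result is 31 + clz(0xFFFF) = 31 + 16 = 47.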
4028 const SDValue &Operand = Op.getOperand(1);
4029 const EVT VTy = Op.getValueType();
4031 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
4032 DAG.getConstant(1, dl, VTy));
4033 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
4034 DAG.getConstant(0, dl, VTy));
4035 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4036 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4037 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4038 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4039 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4040 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4041 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4042 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4043 SDValue CheckLo =
4044 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4045 SDValue HiIsZero =
4046 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4047 SDValue AdjustedLo =
4048 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4049 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4050 SDValue Result =
4051 DAG.getSelect(dl, VTy, CheckLo,
4052 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4053 return Result;
4055 case Intrinsic::eh_sjlj_lsda: {
4056 MachineFunction &MF = DAG.getMachineFunction();
4057 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4058 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4059 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4060 SDValue CPAddr;
4061 bool IsPositionIndependent = isPositionIndependent();
4062 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4063 ARMConstantPoolValue *CPV =
4064 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4065 ARMCP::CPLSDA, PCAdj);
4066 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4067 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4068 SDValue Result = DAG.getLoad(
4069 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4070 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
4072 if (IsPositionIndependent) {
4073 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4074 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4076 return Result;
4078 case Intrinsic::arm_neon_vabs:
4079 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4080 Op.getOperand(1));
4081 case Intrinsic::arm_neon_vmulls:
4082 case Intrinsic::arm_neon_vmullu: {
4083 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4084 ? ARMISD::VMULLs : ARMISD::VMULLu;
4085 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4086 Op.getOperand(1), Op.getOperand(2));
4088 case Intrinsic::arm_neon_vminnm:
4089 case Intrinsic::arm_neon_vmaxnm: {
4090 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4091 ? ISD::FMINNUM : ISD::FMAXNUM;
4092 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4093 Op.getOperand(1), Op.getOperand(2));
4095 case Intrinsic::arm_neon_vminu:
4096 case Intrinsic::arm_neon_vmaxu: {
4097 if (Op.getValueType().isFloatingPoint())
4098 return SDValue();
4099 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4100 ? ISD::UMIN : ISD::UMAX;
4101 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4102 Op.getOperand(1), Op.getOperand(2));
4104 case Intrinsic::arm_neon_vmins:
4105 case Intrinsic::arm_neon_vmaxs: {
4106 // v{min,max}s is overloaded between signed integers and floats.
4107 if (!Op.getValueType().isFloatingPoint()) {
4108 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4109 ? ISD::SMIN : ISD::SMAX;
4110 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4111 Op.getOperand(1), Op.getOperand(2));
4113 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4114 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4115 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4116 Op.getOperand(1), Op.getOperand(2));
4118 case Intrinsic::arm_neon_vtbl1:
4119 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4120 Op.getOperand(1), Op.getOperand(2));
4121 case Intrinsic::arm_neon_vtbl2:
4122 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4123 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4124 case Intrinsic::arm_mve_pred_i2v:
4125 case Intrinsic::arm_mve_pred_v2i:
4126 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4127 Op.getOperand(1));
4128 case Intrinsic::arm_mve_vreinterpretq:
4129 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4130 Op.getOperand(1));
4131 case Intrinsic::arm_mve_lsll:
4132 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4133 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4134 case Intrinsic::arm_mve_asrl:
4135 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4136 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4140 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
4141 const ARMSubtarget *Subtarget) {
4142 SDLoc dl(Op);
4143 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
4144 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
4145 if (SSID == SyncScope::SingleThread)
4146 return Op;
4148 if (!Subtarget->hasDataBarrier()) {
4149 // Some ARMv6 CPUs can support data barriers with an mcr instruction.
4150 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4151 // here.
4152 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4153 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4154 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4155 DAG.getConstant(0, dl, MVT::i32));
4158 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
4159 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
4160 ARM_MB::MemBOpt Domain = ARM_MB::ISH;
4161 if (Subtarget->isMClass()) {
4162 // Only a full system barrier exists in the M-class architectures.
4163 Domain = ARM_MB::SY;
4164 } else if (Subtarget->preferISHSTBarriers() &&
4165 Ord == AtomicOrdering::Release) {
4166 // Swift happens to implement ISHST barriers in a way that's compatible with
4167 // Release semantics but weaker than ISH so we'd be fools not to use
4168 // it. Beware: other processors probably don't!
4169 Domain = ARM_MB::ISHST;
4172 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4173 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4174 DAG.getConstant(Domain, dl, MVT::i32));
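/// Lower ISD::PREFETCH to ARMISD::PRELOAD, or fold it away (keeping only the
/// chain) on targets that have no preload instruction or lack PLDW for writes.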
4177 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4178 const ARMSubtarget *Subtarget) {
4179 // ARM pre-v5TE and Thumb1 do not have preload instructions.
4180 if (!(Subtarget->isThumb2() ||
4181 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4182 // Just preserve the chain.
4183 return Op.getOperand(0);
4185 SDLoc dl(Op);
4186 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
4187 if (!isRead &&
4188 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4189 // ARMv7 with MP extension has PLDW.
4190 return Op.getOperand(0);
4192 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
4193 if (Subtarget->isThumb()) {
4194 // Invert the bits.
4195 isRead = ~isRead & 1;
4196 isData = ~isData & 1;
4199 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4200 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4201 DAG.getConstant(isData, dl, MVT::i32));
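/// Lower ISD::VASTART, which simply records the start of the
/// variable-argument save area in the va_list object.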
4204 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4205 MachineFunction &MF = DAG.getMachineFunction();
4206 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4208 // vastart just stores the address of the VarArgsFrameIndex slot into the
4209 // memory location argument.
4210 SDLoc dl(Op);
4211 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4212 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4213 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4214 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4215 MachinePointerInfo(SV));
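/// Reassemble an f64 formal argument that was split by the calling convention:
/// the low half arrives in a GPR, the high half either in the next GPR or in a
/// stack slot, and the two halves are glued back together with a VMOVDRR.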
4218 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4219 CCValAssign &NextVA,
4220 SDValue &Root,
4221 SelectionDAG &DAG,
4222 const SDLoc &dl) const {
4223 MachineFunction &MF = DAG.getMachineFunction();
4224 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4226 const TargetRegisterClass *RC;
4227 if (AFI->isThumb1OnlyFunction())
4228 RC = &ARM::tGPRRegClass;
4229 else
4230 RC = &ARM::GPRRegClass;
4232 // Transform the arguments stored in physical registers into virtual ones.
4233 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4234 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4236 SDValue ArgValue2;
4237 if (NextVA.isMemLoc()) {
4238 MachineFrameInfo &MFI = MF.getFrameInfo();
4239 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4241 // Create load node to retrieve arguments from the stack.
4242 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4243 ArgValue2 = DAG.getLoad(
4244 MVT::i32, dl, Root, FIN,
4245 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4246 } else {
4247 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4248 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4250 if (!Subtarget->isLittle())
4251 std::swap (ArgValue, ArgValue2);
4252 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4255 // The remaining GPRs hold either the beginning of variable-argument
4256 // data, or the beginning of an aggregate passed by value (usually
4257 // byval). Either way, we allocate stack slots adjacent to the data
4258 // provided by our caller, and store the unallocated registers there.
4259 // If this is a variadic function, the va_list pointer will begin with
4260 // these values; otherwise, this reassembles a (byval) structure that
4261 // was split between registers and memory.
4262 // Return: The frame index the registers were stored into.
4263 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4264 const SDLoc &dl, SDValue &Chain,
4265 const Value *OrigArg,
4266 unsigned InRegsParamRecordIdx,
4267 int ArgOffset, unsigned ArgSize) const {
4268 // Currently, two use cases are possible:
4269 // Case #1. Non-var-args function, and we meet the first byval parameter.
4270 // Set up the first unallocated register as the first byval register;
4271 // eat all remaining registers
4272 // (these two actions are performed by the HandleByVal method).
4273 // Then, here, we initialize the stack frame with
4274 // "store-reg" instructions.
4275 // Case #2. Var-args function that doesn't contain byval parameters.
4276 // The same: eat all remaining unallocated registers and
4277 // initialize the stack frame.
4279 MachineFunction &MF = DAG.getMachineFunction();
4280 MachineFrameInfo &MFI = MF.getFrameInfo();
4281 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4282 unsigned RBegin, REnd;
4283 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4284 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4285 } else {
4286 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4287 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4288 REnd = ARM::R4;
4291 if (REnd != RBegin)
4292 ArgOffset = -4 * (ARM::R4 - RBegin);
4294 auto PtrVT = getPointerTy(DAG.getDataLayout());
4295 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4296 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4298 SmallVector<SDValue, 4> MemOps;
4299 const TargetRegisterClass *RC =
4300 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4302 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4303 unsigned VReg = MF.addLiveIn(Reg, RC);
4304 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4305 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4306 MachinePointerInfo(OrigArg, 4 * i));
4307 MemOps.push_back(Store);
4308 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4311 if (!MemOps.empty())
4312 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4313 return FrameIndex;
4316 // Set up the stack frame that the va_list pointer will start from.
4317 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4318 const SDLoc &dl, SDValue &Chain,
4319 unsigned ArgOffset,
4320 unsigned TotalArgRegsSaveSize,
4321 bool ForceMutable) const {
4322 MachineFunction &MF = DAG.getMachineFunction();
4323 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4325 // Try to store any remaining integer argument regs
4326 // to their spots on the stack so that they may be loaded by dereferencing
4327 // the result of va_next.
4328 // If there are no regs to be stored, just point the address past the last
4329 // argument passed via the stack.
4330 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
4331 CCInfo.getInRegsParamsCount(),
4332 CCInfo.getNextStackOffset(),
4333 std::max(4U, TotalArgRegsSaveSize));
4334 AFI->setVarArgsFrameIndex(FrameIndex);
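/// Split a value into ABI register parts. The only case handled specially here
/// is an f16/bf16 value passed in the low bits of an f32 register; everything
/// else defers to the generic splitting code by returning false.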
4337 bool ARMTargetLowering::splitValueIntoRegisterParts(
4338 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4339 unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
4340 bool IsABIRegCopy = CC.hasValue();
4341 EVT ValueVT = Val.getValueType();
4342 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
4343 PartVT == MVT::f32) {
4344 unsigned ValueBits = ValueVT.getSizeInBits();
4345 unsigned PartBits = PartVT.getSizeInBits();
4346 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4347 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4348 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4349 Parts[0] = Val;
4350 return true;
4352 return false;
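/// The inverse of splitValueIntoRegisterParts: recover an f16/bf16 value from
/// the low bits of an f32 register part, or return an empty SDValue to let the
/// generic code handle it.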
4355 SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4356 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4357 MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
4358 bool IsABIRegCopy = CC.hasValue();
4359 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
4360 PartVT == MVT::f32) {
4361 unsigned ValueBits = ValueVT.getSizeInBits();
4362 unsigned PartBits = PartVT.getSizeInBits();
4363 SDValue Val = Parts[0];
4365 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4366 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4367 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4368 return Val;
4370 return SDValue();
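/// Lower the incoming (formal) arguments of a function: assign argument
/// locations with the calling convention, copy register arguments out of their
/// physical registers, load stack arguments, and materialise the byval and
/// variadic register save areas.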
4373 SDValue ARMTargetLowering::LowerFormalArguments(
4374 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4375 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4376 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4377 MachineFunction &MF = DAG.getMachineFunction();
4378 MachineFrameInfo &MFI = MF.getFrameInfo();
4380 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4382 // Assign locations to all of the incoming arguments.
4383 SmallVector<CCValAssign, 16> ArgLocs;
4384 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4385 *DAG.getContext());
4386 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4388 SmallVector<SDValue, 16> ArgValues;
4389 SDValue ArgValue;
4390 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4391 unsigned CurArgIdx = 0;
4393 // Initially ArgRegsSaveSize is zero.
4394 // Then we increase this value each time we meet a byval parameter.
4395 // We also increase this value in the case of a varargs function.
4396 AFI->setArgRegsSaveSize(0);
4398 // Calculate the amount of stack space that we need to allocate to store
4399 // byval and variadic arguments that are passed in registers.
4400 // We need to know this before we allocate the first byval or variadic
4401 // argument, as they will be allocated a stack slot below the CFA (Canonical
4402 // Frame Address, the stack pointer at entry to the function).
4403 unsigned ArgRegBegin = ARM::R4;
4404 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4405 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4406 break;
4408 CCValAssign &VA = ArgLocs[i];
4409 unsigned Index = VA.getValNo();
4410 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4411 if (!Flags.isByVal())
4412 continue;
4414 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4415 unsigned RBegin, REnd;
4416 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4417 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4419 CCInfo.nextInRegsParam();
4421 CCInfo.rewindByValRegsInfo();
4423 int lastInsIndex = -1;
4424 if (isVarArg && MFI.hasVAStart()) {
4425 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4426 if (RegIdx != array_lengthof(GPRArgRegs))
4427 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4430 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4431 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4432 auto PtrVT = getPointerTy(DAG.getDataLayout());
4434 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4435 CCValAssign &VA = ArgLocs[i];
4436 if (Ins[VA.getValNo()].isOrigArg()) {
4437 std::advance(CurOrigArg,
4438 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4439 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4441 // Arguments stored in registers.
4442 if (VA.isRegLoc()) {
4443 EVT RegVT = VA.getLocVT();
4445 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4446 // f64 and vector types are split up into multiple registers or
4447 // combinations of registers and stack slots.
4448 SDValue ArgValue1 =
4449 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4450 VA = ArgLocs[++i]; // skip ahead to next loc
4451 SDValue ArgValue2;
4452 if (VA.isMemLoc()) {
4453 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4454 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4455 ArgValue2 = DAG.getLoad(
4456 MVT::f64, dl, Chain, FIN,
4457 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4458 } else {
4459 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4461 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4462 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4463 ArgValue1, DAG.getIntPtrConstant(0, dl));
4464 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4465 ArgValue2, DAG.getIntPtrConstant(1, dl));
4466 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4467 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4468 } else {
4469 const TargetRegisterClass *RC;
4471 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4472 RC = &ARM::HPRRegClass;
4473 else if (RegVT == MVT::f32)
4474 RC = &ARM::SPRRegClass;
4475 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4476 RegVT == MVT::v4bf16)
4477 RC = &ARM::DPRRegClass;
4478 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4479 RegVT == MVT::v8bf16)
4480 RC = &ARM::QPRRegClass;
4481 else if (RegVT == MVT::i32)
4482 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4483 : &ARM::GPRRegClass;
4484 else
4485 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4487 // Transform the arguments in physical registers into virtual ones.
4488 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4489 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4491 // If this value is passed in r0 and has the returned attribute (e.g.
4492 // C++ 'structors), record this fact for later use.
4493 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4494 AFI->setPreservesR0();
4498 // If this is an 8 or 16-bit value, it is really passed promoted
4499 // to 32 bits. Insert an assert[sz]ext to capture this, then
4500 // truncate to the right size.
4501 switch (VA.getLocInfo()) {
4502 default: llvm_unreachable("Unknown loc info!");
4503 case CCValAssign::Full: break;
4504 case CCValAssign::BCvt:
4505 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4506 break;
4507 case CCValAssign::SExt:
4508 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4509 DAG.getValueType(VA.getValVT()));
4510 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4511 break;
4512 case CCValAssign::ZExt:
4513 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4514 DAG.getValueType(VA.getValVT()));
4515 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4516 break;
4519 // f16 arguments have their size extended to 4 bytes and are passed as if
4520 // they had been copied to the LSBs of a 32-bit register.
4521 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI)
4522 if (VA.needsCustom() &&
4523 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4524 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4526 InVals.push_back(ArgValue);
4527 } else { // VA.isRegLoc()
4528 // sanity check
4529 assert(VA.isMemLoc());
4530 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4532 int index = VA.getValNo();
4534 // Some Ins[] entries become multiple ArgLoc[] entries.
4535 // Process them only once.
4536 if (index != lastInsIndex)
4538 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4539 // FIXME: For now, all byval parameter objects are marked mutable.
4540 // This can be changed with more analysis.
4541 // In case of tail call optimization, mark all arguments mutable, since
4542 // they could be overwritten by the lowering of arguments in case of
4543 // a tail call.
4544 if (Flags.isByVal()) {
4545 assert(Ins[index].isOrigArg() &&
4546 "Byval arguments cannot be implicit");
4547 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4549 int FrameIndex = StoreByValRegs(
4550 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4551 VA.getLocMemOffset(), Flags.getByValSize());
4552 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4553 CCInfo.nextInRegsParam();
4554 } else {
4555 unsigned FIOffset = VA.getLocMemOffset();
4556 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4557 FIOffset, true);
4559 // Create load nodes to retrieve arguments from the stack.
4560 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4561 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4562 MachinePointerInfo::getFixedStack(
4563 DAG.getMachineFunction(), FI)));
4565 lastInsIndex = index;
4570 // varargs
4571 if (isVarArg && MFI.hasVAStart()) {
4572 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
4573 TotalArgRegsSaveSize);
4574 if (AFI->isCmseNSEntryFunction()) {
4575 DiagnosticInfoUnsupported Diag(
4576 DAG.getMachineFunction().getFunction(),
4577 "secure entry function must not be variadic", dl.getDebugLoc());
4578 DAG.getContext()->diagnose(Diag);
4582 unsigned StackArgSize = CCInfo.getNextStackOffset();
4583 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4584 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4585 // The only way to guarantee a tail call is if the callee restores its
4586 // argument area, but it must also keep the stack aligned when doing so.
4587 const DataLayout &DL = DAG.getDataLayout();
4588 StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
4590 AFI->setArgumentStackToRestore(StackArgSize);
4592 AFI->setArgumentStackSize(StackArgSize);
4594 if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
4595 DiagnosticInfoUnsupported Diag(
4596 DAG.getMachineFunction().getFunction(),
4597 "secure entry function requires arguments on stack", dl.getDebugLoc());
4598 DAG.getContext()->diagnose(Diag);
4601 return Chain;
4604 /// isFloatingPointZero - Return true if this is +0.0.
4605 static bool isFloatingPointZero(SDValue Op) {
4606 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4607 return CFP->getValueAPF().isPosZero();
4608 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4609 // Maybe this has already been legalized into the constant pool?
4610 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4611 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4612 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4613 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4614 return CFP->getValueAPF().isPosZero();
4616 } else if (Op->getOpcode() == ISD::BITCAST &&
4617 Op->getValueType(0) == MVT::f64) {
4618 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4619 // created by LowerConstantFP().
4620 SDValue BitcastOp = Op->getOperand(0);
4621 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4622 isNullConstant(BitcastOp->getOperand(0)))
4623 return true;
4625 return false;
4628 /// Returns an appropriate ARM CMP (cmp) and the corresponding condition code
4629 /// for the given operands.
4630 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4631 SDValue &ARMcc, SelectionDAG &DAG,
4632 const SDLoc &dl) const {
4633 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4634 unsigned C = RHSC->getZExtValue();
4635 if (!isLegalICmpImmediate((int32_t)C)) {
4636 // Constant does not fit; try adjusting it by one.
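// For example, if 0x101 is not a legal compare immediate but 0x100 is,
// then "x < 0x101" is rewritten as "x <= 0x100".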
4637 switch (CC) {
4638 default: break;
4639 case ISD::SETLT:
4640 case ISD::SETGE:
4641 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4642 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4643 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4645 break;
4646 case ISD::SETULT:
4647 case ISD::SETUGE:
4648 if (C != 0 && isLegalICmpImmediate(C-1)) {
4649 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4650 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4652 break;
4653 case ISD::SETLE:
4654 case ISD::SETGT:
4655 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4656 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4657 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4659 break;
4660 case ISD::SETULE:
4661 case ISD::SETUGT:
4662 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4663 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4664 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4666 break;
4669 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4670 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4671 // In ARM and Thumb-2, the compare instructions can shift their second
4672 // operand.
4673 CC = ISD::getSetCCSwappedOperands(CC);
4674 std::swap(LHS, RHS);
4677 // Thumb1 has very limited immediate modes, so turning an "and" into a
4678 // shift can save multiple instructions.
4680 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4681 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4682 // own. If it's the operand to an unsigned comparison with an immediate,
4683 // we can eliminate one of the shifts: we transform
4684 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4686 // We avoid transforming cases which aren't profitable due to encoding
4687 // details:
4689 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4690 // would not; in that case, we're essentially trading one immediate load for
4691 // another.
4692 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4693 // 3. C2 is zero; we have other code for this special case.
4695 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4696 // instruction, since the AND is always one instruction anyway, but we could
4697 // use narrow instructions in some cases.
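// For example, when the checks below pass, "(x & 0x00ffffff) == 0x1234" is
// rewritten as "(x << 8) == 0x00123400".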
4698 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4699 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4700 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4701 !isSignedIntSetCC(CC)) {
4702 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
4703 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4704 uint64_t RHSV = RHSC->getZExtValue();
4705 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4706 unsigned ShiftBits = countLeadingZeros(Mask);
4707 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4708 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4709 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4710 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4715 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4716 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4717 // way a cmp would.
4718 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4719 // some tweaks to the heuristics for the previous and->shift transform.
4720 // FIXME: Optimize cases where the LHS isn't a shift.
4721 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4722 isa<ConstantSDNode>(RHS) &&
4723 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
4724 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4725 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
4726 unsigned ShiftAmt =
4727 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
4728 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4729 DAG.getVTList(MVT::i32, MVT::i32),
4730 LHS.getOperand(0),
4731 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4732 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4733 Shift.getValue(1), SDValue());
4734 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4735 return Chain.getValue(1);
4738 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4740 // If the RHS is a constant zero then the V (overflow) flag will never be
4741 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4742 // simpler for other passes (like the peephole optimiser) to deal with.
4743 if (isNullConstant(RHS)) {
4744 switch (CondCode) {
4745 default: break;
4746 case ARMCC::GE:
4747 CondCode = ARMCC::PL;
4748 break;
4749 case ARMCC::LT:
4750 CondCode = ARMCC::MI;
4751 break;
4755 ARMISD::NodeType CompareType;
4756 switch (CondCode) {
4757 default:
4758 CompareType = ARMISD::CMP;
4759 break;
4760 case ARMCC::EQ:
4761 case ARMCC::NE:
4762 // Uses only Z Flag
4763 CompareType = ARMISD::CMPZ;
4764 break;
4766 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4767 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4770 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4771 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4772 SelectionDAG &DAG, const SDLoc &dl,
4773 bool Signaling) const {
4774 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4775 SDValue Cmp;
4776 if (!isFloatingPointZero(RHS))
4777 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
4778 dl, MVT::Glue, LHS, RHS);
4779 else
4780 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
4781 dl, MVT::Glue, LHS);
4782 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4785 /// duplicateCmp - Glue values can have only one use, so this function
4786 /// duplicates a comparison node.
4787 SDValue
4788 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4789 unsigned Opc = Cmp.getOpcode();
4790 SDLoc DL(Cmp);
4791 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4792 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4794 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4795 Cmp = Cmp.getOperand(0);
4796 Opc = Cmp.getOpcode();
4797 if (Opc == ARMISD::CMPFP)
4798 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4799 else {
4800 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4801 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
4803 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4806 // This function returns three things: the arithmetic computation itself
4807 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4808 // comparison and the condition code define the case in which the arithmetic
4809 // computation *does not* overflow.
4810 std::pair<SDValue, SDValue>
4811 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4812 SDValue &ARMcc) const {
4813 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4815 SDValue Value, OverflowCmp;
4816 SDValue LHS = Op.getOperand(0);
4817 SDValue RHS = Op.getOperand(1);
4818 SDLoc dl(Op);
4820 // FIXME: We are currently always generating CMPs because we don't support
4821 // generating CMN through the backend. This is not as good as the natural
4822 // CMP case because it causes a register dependency and cannot be folded
4823 // later.
4825 switch (Op.getOpcode()) {
4826 default:
4827 llvm_unreachable("Unknown overflow instruction!");
4828 case ISD::SADDO:
4829 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4830 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4831 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4832 break;
4833 case ISD::UADDO:
4834 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4835 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4836 // We do not use it in the USUBO case as Value may not be used.
4837 Value = DAG.getNode(ARMISD::ADDC, dl,
4838 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4839 .getValue(0);
4840 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4841 break;
4842 case ISD::SSUBO:
4843 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4844 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4845 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4846 break;
4847 case ISD::USUBO:
4848 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4849 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4850 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4851 break;
4852 case ISD::UMULO:
4853 // We generate a UMUL_LOHI and then check if the high word is 0.
4854 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4855 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4856 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4857 LHS, RHS);
4858 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4859 DAG.getConstant(0, dl, MVT::i32));
4860 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4861 break;
4862 case ISD::SMULO:
4863 // We generate a SMUL_LOHI and then check if all the bits of the high word
4864 // are the same as the sign bit of the low word.
4865 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4866 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4867 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4868 LHS, RHS);
4869 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4870 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4871 Value.getValue(0),
4872 DAG.getConstant(31, dl, MVT::i32)));
4873 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4874 break;
4875 } // switch (...)
4877 return std::make_pair(Value, OverflowCmp);
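/// Lower an overflow-checking arithmetic node by computing the value and the
/// overflow comparison with getARMXALUOOp, then materialising the overflow bit
/// as 0 or 1 with a CMOV.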
4880 SDValue
4881 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4882 // Let legalize expand this if it isn't a legal type yet.
4883 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4884 return SDValue();
4886 SDValue Value, OverflowCmp;
4887 SDValue ARMcc;
4888 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4889 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4890 SDLoc dl(Op);
4891 // We use 0 and 1 as false and true values.
4892 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4893 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4894 EVT VT = Op.getValueType();
4896 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
4897 ARMcc, CCR, OverflowCmp);
4899 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4900 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4903 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4904 SelectionDAG &DAG) {
4905 SDLoc DL(BoolCarry);
4906 EVT CarryVT = BoolCarry.getValueType();
4908 // This converts the boolean value carry into the carry flag by doing
4909 // ARMISD::SUBC Carry, 1
4910 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4911 DAG.getVTList(CarryVT, MVT::i32),
4912 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4913 return Carry.getValue(1);
4916 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4917 SelectionDAG &DAG) {
4918 SDLoc DL(Flags);
4920 // Now convert the carry flag into a boolean carry. We do this
4921 // using ARMISD::ADDE 0, 0, Carry.
4922 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4923 DAG.getConstant(0, DL, MVT::i32),
4924 DAG.getConstant(0, DL, MVT::i32), Flags);
4927 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4928 SelectionDAG &DAG) const {
4929 // Let legalize expand this if it isn't a legal type yet.
4930 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4931 return SDValue();
4933 SDValue LHS = Op.getOperand(0);
4934 SDValue RHS = Op.getOperand(1);
4935 SDLoc dl(Op);
4937 EVT VT = Op.getValueType();
4938 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4939 SDValue Value;
4940 SDValue Overflow;
4941 switch (Op.getOpcode()) {
4942 default:
4943 llvm_unreachable("Unknown overflow instruction!");
4944 case ISD::UADDO:
4945 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4946 // Convert the carry flag into a boolean value.
4947 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4948 break;
4949 case ISD::USUBO: {
4950 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4951 // Convert the carry flag into a boolean value.
4952 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4953 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4954 // value by computing 1 - C.
4955 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4956 DAG.getConstant(1, dl, MVT::i32), Overflow);
4957 break;
4961 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
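/// Lower saturating add/subtract of i8 and i16 to the corresponding
/// ARMISD::{U,}Q{ADD,SUB}{8,16}b nodes on targets with ARMv6 DSP instructions.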
4964 static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4965 const ARMSubtarget *Subtarget) {
4966 EVT VT = Op.getValueType();
4967 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
4968 return SDValue();
4969 if (!VT.isSimple())
4970 return SDValue();
4972 unsigned NewOpcode;
4973 switch (VT.getSimpleVT().SimpleTy) {
4974 default:
4975 return SDValue();
4976 case MVT::i8:
4977 switch (Op->getOpcode()) {
4978 case ISD::UADDSAT:
4979 NewOpcode = ARMISD::UQADD8b;
4980 break;
4981 case ISD::SADDSAT:
4982 NewOpcode = ARMISD::QADD8b;
4983 break;
4984 case ISD::USUBSAT:
4985 NewOpcode = ARMISD::UQSUB8b;
4986 break;
4987 case ISD::SSUBSAT:
4988 NewOpcode = ARMISD::QSUB8b;
4989 break;
4991 break;
4992 case MVT::i16:
4993 switch (Op->getOpcode()) {
4994 case ISD::UADDSAT:
4995 NewOpcode = ARMISD::UQADD16b;
4996 break;
4997 case ISD::SADDSAT:
4998 NewOpcode = ARMISD::QADD16b;
4999 break;
5000 case ISD::USUBSAT:
5001 NewOpcode = ARMISD::UQSUB16b;
5002 break;
5003 case ISD::SSUBSAT:
5004 NewOpcode = ARMISD::QSUB16b;
5005 break;
5007 break;
5010 SDLoc dl(Op);
5011 SDValue Add =
5012 DAG.getNode(NewOpcode, dl, MVT::i32,
5013 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5014 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5015 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
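/// Lower ISD::SELECT. Overflow flags feeding the condition are turned directly
/// into a CMOV on the overflow comparison, boolean CMOV conditions are folded,
/// and everything else is handled as a select_cc against zero.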
5018 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5019 SDValue Cond = Op.getOperand(0);
5020 SDValue SelectTrue = Op.getOperand(1);
5021 SDValue SelectFalse = Op.getOperand(2);
5022 SDLoc dl(Op);
5023 unsigned Opc = Cond.getOpcode();
5025 if (Cond.getResNo() == 1 &&
5026 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5027 Opc == ISD::USUBO)) {
5028 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5029 return SDValue();
5031 SDValue Value, OverflowCmp;
5032 SDValue ARMcc;
5033 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5034 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5035 EVT VT = Op.getValueType();
5037 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
5038 OverflowCmp, DAG);
5041 // Convert:
5043 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5044 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5046 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5047 const ConstantSDNode *CMOVTrue =
5048 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5049 const ConstantSDNode *CMOVFalse =
5050 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5052 if (CMOVTrue && CMOVFalse) {
5053 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5054 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5056 SDValue True;
5057 SDValue False;
5058 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5059 True = SelectTrue;
5060 False = SelectFalse;
5061 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5062 True = SelectFalse;
5063 False = SelectTrue;
5066 if (True.getNode() && False.getNode()) {
5067 EVT VT = Op.getValueType();
5068 SDValue ARMcc = Cond.getOperand(2);
5069 SDValue CCR = Cond.getOperand(3);
5070 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
5071 assert(True.getValueType() == VT);
5072 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
5077 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5078 // undefined bits before doing a full-word comparison with zero.
5079 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5080 DAG.getConstant(1, dl, Cond.getValueType()));
5082 return DAG.getSelectCC(dl, Cond,
5083 DAG.getConstant(0, dl, Cond.getValueType()),
5084 SelectTrue, SelectFalse, ISD::SETNE);
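/// Map an ISD floating-point condition code onto the limited set of condition
/// codes VSEL can encode (GE, GT, VS, EQ), recording whether the compare
/// operands and/or the VSEL operands must be swapped to make that work.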
5087 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5088 bool &swpCmpOps, bool &swpVselOps) {
5089 // Start by selecting the GE condition code for opcodes that return true for
5090 // 'equality'
5091 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5092 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5093 CondCode = ARMCC::GE;
5095 // and GT for opcodes that return false for 'equality'.
5096 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5097 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5098 CondCode = ARMCC::GT;
5100 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5101 // to swap the compare operands.
5102 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5103 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5104 swpCmpOps = true;
5106 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5107 // If we have an unordered opcode, we need to swap the operands to the VSEL
5108 // instruction (effectively negating the condition).
5110 // This also has the effect of swapping which one of 'less' or 'greater'
5111 // returns true, so we also swap the compare operands. It also switches
5112 // whether we return true for 'equality', so we compensate by picking the
5113 // opposite condition code to our original choice.
5114 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5115 CC == ISD::SETUGT) {
5116 swpCmpOps = !swpCmpOps;
5117 swpVselOps = !swpVselOps;
5118 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5121 // 'ordered' is 'anything but unordered', so use the VS condition code and
5122 // swap the VSEL operands.
5123 if (CC == ISD::SETO) {
5124 CondCode = ARMCC::VS;
5125 swpVselOps = true;
5128 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5129 // code and swap the VSEL operands. Also do this if we don't care about the
5130 // unordered case.
5131 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5132 CondCode = ARMCC::EQ;
5133 swpVselOps = true;
5137 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5138 SDValue TrueVal, SDValue ARMcc, SDValue CCR,
5139 SDValue Cmp, SelectionDAG &DAG) const {
5140 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5141 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5142 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5143 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5144 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5146 SDValue TrueLow = TrueVal.getValue(0);
5147 SDValue TrueHigh = TrueVal.getValue(1);
5148 SDValue FalseLow = FalseVal.getValue(0);
5149 SDValue FalseHigh = FalseVal.getValue(1);
5151 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5152 ARMcc, CCR, Cmp);
5153 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5154 ARMcc, CCR, duplicateCmp(Cmp, DAG));
5156 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5157 } else {
5158 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
5159 Cmp);
5163 static bool isGTorGE(ISD::CondCode CC) {
5164 return CC == ISD::SETGT || CC == ISD::SETGE;
5167 static bool isLTorLE(ISD::CondCode CC) {
5168 return CC == ISD::SETLT || CC == ISD::SETLE;
5171 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5172 // All of these conditions (and their <= and >= counterparts) will do:
5173 // x < k ? k : x
5174 // x > k ? x : k
5175 // k < x ? x : k
5176 // k > x ? k : x
5177 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5178 const SDValue TrueVal, const SDValue FalseVal,
5179 const ISD::CondCode CC, const SDValue K) {
5180 return (isGTorGE(CC) &&
5181 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5182 (isLTorLE(CC) &&
5183 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5186 // Check if two chained conditionals could be converted into SSAT or USAT.
5188 // SSAT can replace a set of two conditional selectors that bound a number to an
5189 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5191 // x < -k ? -k : (x > k ? k : x)
5192 // x < -k ? -k : (x < k ? x : k)
5193 // x > -k ? (x > k ? k : x) : -k
5194 // x < k ? (x < -k ? -k : x) : k
5195 // etc.
5197 // LLVM canonicalizes these to either a min(max()) or a max(min())
5198 // pattern. This function tries to match one of these and will return an SSAT
5199 // node if successful.
5201 // USAT works similarly to SSAT but bounds to the interval [0, k], where k + 1
5202 // is a power of 2.
5203 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5204 EVT VT = Op.getValueType();
5205 SDValue V1 = Op.getOperand(0);
5206 SDValue K1 = Op.getOperand(1);
5207 SDValue TrueVal1 = Op.getOperand(2);
5208 SDValue FalseVal1 = Op.getOperand(3);
5209 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5211 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5212 if (Op2.getOpcode() != ISD::SELECT_CC)
5213 return SDValue();
5215 SDValue V2 = Op2.getOperand(0);
5216 SDValue K2 = Op2.getOperand(1);
5217 SDValue TrueVal2 = Op2.getOperand(2);
5218 SDValue FalseVal2 = Op2.getOperand(3);
5219 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5221 SDValue V1Tmp = V1;
5222 SDValue V2Tmp = V2;
5224 // Check that the registers and the constants match a max(min()) or min(max())
5225 // pattern.
5226 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5227 K2 != FalseVal2 ||
5228 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5229 return SDValue();
5231 // Check that the constant in the lower-bound check is
5232 // the opposite of the constant in the upper-bound check
5233 // in 1's complement.
5234 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5235 return SDValue();
5237 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5238 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5239 int64_t PosVal = std::max(Val1, Val2);
5240 int64_t NegVal = std::min(Val1, Val2);
5242 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5243 !isPowerOf2_64(PosVal + 1))
5244 return SDValue();
5246 // Handle the difference between USAT (unsigned) and SSAT (signed)
5247 // saturation
5248 // At this point, PosVal is guaranteed to be positive
5249 uint64_t K = PosVal;
5250 SDLoc dl(Op);
5251 if (Val1 == ~Val2)
5252 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5253 DAG.getConstant(countTrailingOnes(K), dl, VT));
5254 if (NegVal == 0)
5255 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5256 DAG.getConstant(countTrailingOnes(K), dl, VT));
5258 return SDValue();
5261 // Check if a condition of the type x < k ? k : x can be converted into a
5262 // bit operation instead of conditional moves.
5263 // Currently this is allowed given:
5264 // - The conditions and values match up
5265 // - k is 0 or -1 (all ones)
5266 // This function will not check the last condition; that's up to the caller.
5267 // It returns true if the transformation can be made, and in that case
5268 // returns x in V and k in SatK.
5269 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5270 SDValue &SatK)
5272 SDValue LHS = Op.getOperand(0);
5273 SDValue RHS = Op.getOperand(1);
5274 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5275 SDValue TrueVal = Op.getOperand(2);
5276 SDValue FalseVal = Op.getOperand(3);
5278 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5279 ? &RHS
5280 : nullptr;
5282 // No constant operation in comparison, early out
5283 if (!K)
5284 return false;
5286 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5287 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5288 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5290 // If the constant on the left and right side, or the variable on the left
5291 // and right, does not match, early out.
5292 if (*K != KTmp || V != VTmp)
5293 return false;
5295 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5296 SatK = *K;
5297 return true;
5300 return false;
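/// Return true if the given floating-point type cannot be handled natively by
/// the available FP units (no VFP2 for f32, no FP64 for f64, no FullFP16 for
/// f16) and therefore needs to be handled as a soft-float operation.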
5303 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5304 if (VT == MVT::f32)
5305 return !Subtarget->hasVFP2Base();
5306 if (VT == MVT::f64)
5307 return !Subtarget->hasFP64();
5308 if (VT == MVT::f16)
5309 return !Subtarget->hasFullFP16();
5310 return false;
5313 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5314 EVT VT = Op.getValueType();
5315 SDLoc dl(Op);
5317 // Try to convert two saturating conditional selects into a single SSAT
5318 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5319 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5320 return SatValue;
5322 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5323 // into more efficient bit operations, which is possible when k is 0 or -1.
5324 // On ARM and Thumb-2, which have flexible operand 2, this will result in
5325 // single instructions. On Thumb the shift and the bit operation will be two
5326 // instructions.
5327 // Only allow this transformation on full-width (32-bit) operations
5328 SDValue LowerSatConstant;
5329 SDValue SatValue;
5330 if (VT == MVT::i32 &&
5331 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5332 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5333 DAG.getConstant(31, dl, VT));
5334 if (isNullConstant(LowerSatConstant)) {
5335 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5336 DAG.getAllOnesConstant(dl, VT));
5337 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5338 } else if (isAllOnesConstant(LowerSatConstant))
5339 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5342 SDValue LHS = Op.getOperand(0);
5343 SDValue RHS = Op.getOperand(1);
5344 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5345 SDValue TrueVal = Op.getOperand(2);
5346 SDValue FalseVal = Op.getOperand(3);
5347 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5348 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5350 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5351 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5352 unsigned TVal = CTVal->getZExtValue();
5353 unsigned FVal = CFVal->getZExtValue();
5354 unsigned Opcode = 0;
5356 if (TVal == ~FVal) {
5357 Opcode = ARMISD::CSINV;
5358 } else if (TVal == ~FVal + 1) {
5359 Opcode = ARMISD::CSNEG;
5360 } else if (TVal + 1 == FVal) {
5361 Opcode = ARMISD::CSINC;
5362 } else if (TVal == FVal + 1) {
5363 Opcode = ARMISD::CSINC;
5364 std::swap(TrueVal, FalseVal);
5365 std::swap(TVal, FVal);
5366 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5369 if (Opcode) {
5370 // If one of the constants is cheaper than another, materialise the
5371 // cheaper one and let the csel generate the other.
5372 if (Opcode != ARMISD::CSINC &&
5373 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5374 std::swap(TrueVal, FalseVal);
5375 std::swap(TVal, FVal);
5376 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5379 // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
5380 // condition to get there. CSINC is not invertible like the other two
5381 // (~(~a) == a, -(-a) == a, but (a+1)+1 != a).
5382 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5383 std::swap(TrueVal, FalseVal);
5384 std::swap(TVal, FVal);
5385 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5388 // Drops F's value because we can get it by inverting/negating TVal.
5389 FalseVal = TrueVal;
5391 SDValue ARMcc;
5392 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5393 EVT VT = TrueVal.getValueType();
5394 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5398 if (isUnsupportedFloatingType(LHS.getValueType())) {
5399 DAG.getTargetLoweringInfo().softenSetCCOperands(
5400 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5402 // If softenSetCCOperands only returned one value, we should compare it to
5403 // zero.
5404 if (!RHS.getNode()) {
5405 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5406 CC = ISD::SETNE;
5410 if (LHS.getValueType() == MVT::i32) {
5411 // Try to generate VSEL on ARMv8.
5412 // The VSEL instruction can't use all the usual ARM condition
5413 // codes: it only has two bits to select the condition code, so it's
5414 // constrained to use only GE, GT, VS and EQ.
5416 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5417 // swap the operands of the previous compare instruction (effectively
5418 // inverting the compare condition, swapping 'less' and 'greater') and
5419 // sometimes need to swap the operands to the VSEL (which inverts the
5420 // condition in the sense of firing whenever the previous condition didn't)
5421 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5422 TrueVal.getValueType() == MVT::f32 ||
5423 TrueVal.getValueType() == MVT::f64)) {
5424 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5425 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5426 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5427 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5428 std::swap(TrueVal, FalseVal);
5432 SDValue ARMcc;
5433 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5434 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5435 // Choose GE over PL, which vsel does not support.
5436 if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
5437 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5438 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5441 ARMCC::CondCodes CondCode, CondCode2;
5442 FPCCToARMCC(CC, CondCode, CondCode2);
5444 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5445 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5446 // must use VSEL (limited condition codes), due to not having conditional f16
5447 // moves.
5448 if (Subtarget->hasFPARMv8Base() &&
5449 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5450 (TrueVal.getValueType() == MVT::f16 ||
5451 TrueVal.getValueType() == MVT::f32 ||
5452 TrueVal.getValueType() == MVT::f64)) {
5453 bool swpCmpOps = false;
5454 bool swpVselOps = false;
5455 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5457 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5458 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5459 if (swpCmpOps)
5460 std::swap(LHS, RHS);
5461 if (swpVselOps)
5462 std::swap(TrueVal, FalseVal);
5466 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5467 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5468 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5469 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5470 if (CondCode2 != ARMCC::AL) {
5471 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5472 // FIXME: Needs another CMP because flag can have but one use.
5473 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
5474 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
5476 return Result;
5479 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
5480 /// to morph to an integer compare sequence.
5481 static bool canChangeToInt(SDValue Op, bool &SeenZero,
5482 const ARMSubtarget *Subtarget) {
5483 SDNode *N = Op.getNode();
5484 if (!N->hasOneUse())
5485 // Otherwise it requires moving the value from fp to integer registers.
5486 return false;
5487 if (!N->getNumValues())
5488 return false;
5489 EVT VT = Op.getValueType();
5490 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5491 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5492 // vmrs are very slow, e.g. cortex-a8.
5493 return false;
5495 if (isFloatingPointZero(Op)) {
5496 SeenZero = true;
5497 return true;
5499 return ISD::isNormalLoad(N);
5502 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5503 if (isFloatingPointZero(Op))
5504 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5506 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5507 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5508 Ld->getPointerInfo(), Ld->getAlignment(),
5509 Ld->getMemOperand()->getFlags());
5511 llvm_unreachable("Unknown VFP cmp argument!");
5514 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5515 SDValue &RetVal1, SDValue &RetVal2) {
5516 SDLoc dl(Op);
5518 if (isFloatingPointZero(Op)) {
5519 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5520 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5521 return;
5524 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5525 SDValue Ptr = Ld->getBasePtr();
5526 RetVal1 =
5527 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5528 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
5530 EVT PtrType = Ptr.getValueType();
5531 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
5532 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5533 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5534 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5535 Ld->getPointerInfo().getWithOffset(4), NewAlign,
5536 Ld->getMemOperand()->getFlags());
5537 return;
5540 llvm_unreachable("Unknown VFP cmp argument!");
5543 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5544 /// f32 and even f64 comparisons to integer ones.
5545 SDValue
5546 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5547 SDValue Chain = Op.getOperand(0);
5548 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5549 SDValue LHS = Op.getOperand(2);
5550 SDValue RHS = Op.getOperand(3);
5551 SDValue Dest = Op.getOperand(4);
5552 SDLoc dl(Op);
5554 bool LHSSeenZero = false;
5555 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5556 bool RHSSeenZero = false;
5557 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5558 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5559 // If unsafe fp math optimization is enabled and there are no other uses of
5560 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5561 // to an integer comparison.
5562 if (CC == ISD::SETOEQ)
5563 CC = ISD::SETEQ;
5564 else if (CC == ISD::SETUNE)
5565 CC = ISD::SETNE;
5567 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5568 SDValue ARMcc;
5569 if (LHS.getValueType() == MVT::f32) {
5570 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5571 bitcastf32Toi32(LHS, DAG), Mask);
5572 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5573 bitcastf32Toi32(RHS, DAG), Mask);
5574 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5575 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5576 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5577 Chain, Dest, ARMcc, CCR, Cmp);
5580 SDValue LHS1, LHS2;
5581 SDValue RHS1, RHS2;
5582 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5583 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5584 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5585 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5586 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5587 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5588 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5589 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5590 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
5593 return SDValue();
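/// Lower ISD::BRCOND. Overflow results from {s,u}{add,sub,mul}.with.overflow
/// are branched on directly via the inverted ARM condition code; all other
/// conditions are left for the generic expansion.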
5596 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5597 SDValue Chain = Op.getOperand(0);
5598 SDValue Cond = Op.getOperand(1);
5599 SDValue Dest = Op.getOperand(2);
5600 SDLoc dl(Op);
5602 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5603 // instruction.
5604 unsigned Opc = Cond.getOpcode();
5605 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5606 !Subtarget->isThumb1Only();
5607 if (Cond.getResNo() == 1 &&
5608 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5609 Opc == ISD::USUBO || OptimizeMul)) {
5610 // Only lower legal XALUO ops.
5611 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5612 return SDValue();
5614 // The actual operation with overflow check.
5615 SDValue Value, OverflowCmp;
5616 SDValue ARMcc;
5617 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5619 // Reverse the condition code.
5620 ARMCC::CondCodes CondCode =
5621 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5622 CondCode = ARMCC::getOppositeCondition(CondCode);
5623 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5624 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5626 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5627 OverflowCmp);
5630 return SDValue();
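/// Lower ISD::BR_CC to ARMISD::BRCOND, softening unsupported floating-point
/// compares, folding overflow intrinsics that feed the compare, and emitting a
/// second conditional branch when the FP condition needs two ARM condition
/// codes.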
5633 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5634 SDValue Chain = Op.getOperand(0);
5635 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5636 SDValue LHS = Op.getOperand(2);
5637 SDValue RHS = Op.getOperand(3);
5638 SDValue Dest = Op.getOperand(4);
5639 SDLoc dl(Op);
5641 if (isUnsupportedFloatingType(LHS.getValueType())) {
5642 DAG.getTargetLoweringInfo().softenSetCCOperands(
5643 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5645 // If softenSetCCOperands only returned one value, we should compare it to
5646 // zero.
5647 if (!RHS.getNode()) {
5648 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5649 CC = ISD::SETNE;
5653 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5654 // instruction.
5655 unsigned Opc = LHS.getOpcode();
5656 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5657 !Subtarget->isThumb1Only();
5658 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5659 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5660 Opc == ISD::USUBO || OptimizeMul) &&
5661 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5662 // Only lower legal XALUO ops.
5663 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5664 return SDValue();
5666 // The actual operation with overflow check.
5667 SDValue Value, OverflowCmp;
5668 SDValue ARMcc;
5669 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5671 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5672 // Reverse the condition code.
5673 ARMCC::CondCodes CondCode =
5674 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5675 CondCode = ARMCC::getOppositeCondition(CondCode);
5676 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5678 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5680 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5681 OverflowCmp);
5684 if (LHS.getValueType() == MVT::i32) {
5685 SDValue ARMcc;
5686 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5687 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5688 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5689 Chain, Dest, ARMcc, CCR, Cmp);
5692 if (getTargetMachine().Options.UnsafeFPMath &&
5693 (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5694 CC == ISD::SETNE || CC == ISD::SETUNE)) {
5695 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5696 return Result;
5699 ARMCC::CondCodes CondCode, CondCode2;
5700 FPCCToARMCC(CC, CondCode, CondCode2);
5702 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5703 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5704 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5705 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5706 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5707 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5708 if (CondCode2 != ARMCC::AL) {
5709 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5710 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
5711 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5713 return Res;
5716 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5717 SDValue Chain = Op.getOperand(0);
5718 SDValue Table = Op.getOperand(1);
5719 SDValue Index = Op.getOperand(2);
5720 SDLoc dl(Op);
5722 EVT PTy = getPointerTy(DAG.getDataLayout());
5723 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5724 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5725 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5726 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5727 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5728 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5729 // Thumb2 and ARMv8-M use a two-level jump: the first branch jumps into the jump
5730 // table, which then does another jump to the destination. This also makes it easier
5731 // to translate it to TBB / TBH later (Thumb2 only).
5732 // FIXME: This might not work if the function is extremely large.
5733 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5734 Addr, Op.getOperand(2), JTI);
5736 if (isPositionIndependent() || Subtarget->isROPI()) {
5737 Addr =
5738 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5739 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5740 Chain = Addr.getValue(1);
5741 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5742 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5743 } else {
5744 Addr =
5745 DAG.getLoad(PTy, dl, Chain, Addr,
5746 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5747 Chain = Addr.getValue(1);
5748 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5752 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5753 EVT VT = Op.getValueType();
5754 SDLoc dl(Op);
5756 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5757 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5758 return Op;
5759 return DAG.UnrollVectorOp(Op.getNode());
5762 const bool HasFullFP16 =
5763 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5765 EVT NewTy;
5766 const EVT OpTy = Op.getOperand(0).getValueType();
5767 if (OpTy == MVT::v4f32)
5768 NewTy = MVT::v4i32;
5769 else if (OpTy == MVT::v4f16 && HasFullFP16)
5770 NewTy = MVT::v4i16;
5771 else if (OpTy == MVT::v8f16 && HasFullFP16)
5772 NewTy = MVT::v8i16;
5773 else
5774 llvm_unreachable("Invalid type for custom lowering!");
5776 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5777 return DAG.UnrollVectorOp(Op.getNode());
5779 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5780 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5783 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5784 EVT VT = Op.getValueType();
5785 if (VT.isVector())
5786 return LowerVectorFP_TO_INT(Op, DAG);
5788 bool IsStrict = Op->isStrictFPOpcode();
5789 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5791 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5792 RTLIB::Libcall LC;
5793 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5794 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5795 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5796 Op.getValueType());
5797 else
5798 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5799 Op.getValueType());
5800 SDLoc Loc(Op);
5801 MakeLibCallOptions CallOptions;
5802 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5803 SDValue Result;
5804 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5805 CallOptions, Loc, Chain);
5806 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5809 // FIXME: Remove this when we have strict fp instruction selection patterns
5810 if (IsStrict) {
5811 SDLoc Loc(Op);
5812 SDValue Result =
5813 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5814 : ISD::FP_TO_UINT,
5815 Loc, Op.getValueType(), SrcVal);
5816 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5819 return Op;
5822 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5823 EVT VT = Op.getValueType();
5824 SDLoc dl(Op);
5826 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5827 if (VT.getVectorElementType() == MVT::f32)
5828 return Op;
5829 return DAG.UnrollVectorOp(Op.getNode());
5832 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5833 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5834 "Invalid type for custom lowering!");
5836 const bool HasFullFP16 =
5837 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5839 EVT DestVecType;
5840 if (VT == MVT::v4f32)
5841 DestVecType = MVT::v4i32;
5842 else if (VT == MVT::v4f16 && HasFullFP16)
5843 DestVecType = MVT::v4i16;
5844 else if (VT == MVT::v8f16 && HasFullFP16)
5845 DestVecType = MVT::v8i16;
5846 else
5847 return DAG.UnrollVectorOp(Op.getNode());
5849 unsigned CastOpc;
5850 unsigned Opc;
5851 switch (Op.getOpcode()) {
5852 default: llvm_unreachable("Invalid opcode!");
5853 case ISD::SINT_TO_FP:
5854 CastOpc = ISD::SIGN_EXTEND;
5855 Opc = ISD::SINT_TO_FP;
5856 break;
5857 case ISD::UINT_TO_FP:
5858 CastOpc = ISD::ZERO_EXTEND;
5859 Opc = ISD::UINT_TO_FP;
5860 break;
5863 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5864 return DAG.getNode(Opc, dl, VT, Op);
5867 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5868 EVT VT = Op.getValueType();
5869 if (VT.isVector())
5870 return LowerVectorINT_TO_FP(Op, DAG);
5871 if (isUnsupportedFloatingType(VT)) {
5872 RTLIB::Libcall LC;
5873 if (Op.getOpcode() == ISD::SINT_TO_FP)
5874 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5875 Op.getValueType());
5876 else
5877 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5878 Op.getValueType());
5879 MakeLibCallOptions CallOptions;
5880 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5881 CallOptions, SDLoc(Op)).first;
5884 return Op;
5887 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5888 // Implement fcopysign by copying the sign bit of operand 1 onto operand 0.
5889 SDValue Tmp0 = Op.getOperand(0);
5890 SDValue Tmp1 = Op.getOperand(1);
5891 SDLoc dl(Op);
5892 EVT VT = Op.getValueType();
5893 EVT SrcVT = Tmp1.getValueType();
5894 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5895 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5896 bool UseNEON = !InGPR && Subtarget->hasNEON();
5898 if (UseNEON) {
5899 // Use VBSL to copy the sign bit.
5900 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5901 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5902 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5903 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5904 if (VT == MVT::f64)
5905 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5906 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5907 DAG.getConstant(32, dl, MVT::i32));
5908 else /*if (VT == MVT::f32)*/
5909 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5910 if (SrcVT == MVT::f32) {
5911 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5912 if (VT == MVT::f64)
5913 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5914 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5915 DAG.getConstant(32, dl, MVT::i32));
5916 } else if (VT == MVT::f32)
5917 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5918 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5919 DAG.getConstant(32, dl, MVT::i32));
5920 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5921 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5923 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5924 dl, MVT::i32);
5925 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5926 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5927 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5929 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5930 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5931 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5932 if (VT == MVT::f32) {
5933 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5934 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5935 DAG.getConstant(0, dl, MVT::i32));
5936 } else {
5937 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5940 return Res;
5943 // Bitcast operand 1 to i32.
5944 if (SrcVT == MVT::f64)
5945 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5946 Tmp1).getValue(1);
5947 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5949 // Or in the signbit with integer operations.
5950 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5951 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5952 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5953 if (VT == MVT::f32) {
5954 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5955 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5956 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5957 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5960 // f64: Or the high part with signbit and then combine two parts.
5961 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5962 Tmp0);
5963 SDValue Lo = Tmp0.getValue(0);
5964 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5965 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5966 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5969 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5970 MachineFunction &MF = DAG.getMachineFunction();
5971 MachineFrameInfo &MFI = MF.getFrameInfo();
5972 MFI.setReturnAddressIsTaken(true);
5974 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
5975 return SDValue();
5977 EVT VT = Op.getValueType();
5978 SDLoc dl(Op);
5979 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5980 if (Depth) {
5981 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5982 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5983 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5984 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5985 MachinePointerInfo());
5988 // Return LR, which contains the return address. Mark it an implicit live-in.
5989 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5990 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5993 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5994 const ARMBaseRegisterInfo &ARI =
5995 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5996 MachineFunction &MF = DAG.getMachineFunction();
5997 MachineFrameInfo &MFI = MF.getFrameInfo();
5998 MFI.setFrameAddressIsTaken(true);
6000 EVT VT = Op.getValueType();
6001 SDLoc dl(Op); // FIXME probably not meaningful
6002 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6003 Register FrameReg = ARI.getFrameRegister(MF);
6004 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6005 while (Depth--)
6006 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6007 MachinePointerInfo());
6008 return FrameAddr;
6011 // FIXME? Maybe this could be a TableGen attribute on some registers and
6012 // this table could be generated automatically from RegInfo.
6013 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6014 const MachineFunction &MF) const {
6015 Register Reg = StringSwitch<unsigned>(RegName)
6016 .Case("sp", ARM::SP)
6017 .Default(0);
6018 if (Reg)
6019 return Reg;
6020 report_fatal_error(Twine("Invalid register name \""
6021 + StringRef(RegName) + "\"."));
6024 // Result is 64 bit value so split into two 32 bit values and return as a
6025 // pair of values.
6026 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6027 SelectionDAG &DAG) {
6028 SDLoc DL(N);
6030 // This function is only supposed to be called for an i64 destination type.
6031 assert(N->getValueType(0) == MVT::i64
6032 && "ExpandREAD_REGISTER called for non-i64 type result.");
6034 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6035 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6036 N->getOperand(0),
6037 N->getOperand(1));
6039 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6040 Read.getValue(1)));
6041 Results.push_back(Read.getOperand(0));
6044 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6045 /// When \p DstVT, the destination type of \p BC, is on the vector
6046 /// register bank and the source of bitcast, \p Op, operates on the same bank,
6047 /// it might be possible to combine them, such that everything stays on the
6048 /// vector register bank.
6049 /// \returns The node that would replace \p BC, if the combine
6050 /// is possible.
6051 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6052 SelectionDAG &DAG) {
6053 SDValue Op = BC->getOperand(0);
6054 EVT DstVT = BC->getValueType(0);
6056 // The only vector instruction that can produce a scalar (remember,
6057 // since the bitcast was about to be turned into VMOVDRR, the source
6058 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6059 // Moreover, we can do this combine only if there is one use.
6060 // Finally, if the destination type is not a vector, there is not
6061 // much point in forcing everything on the vector bank.
6062 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6063 !Op.hasOneUse())
6064 return SDValue();
6066 // If the index is not constant, we will introduce an additional
6067 // multiply that will stick.
6068 // Give up in that case.
6069 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6070 if (!Index)
6071 return SDValue();
6072 unsigned DstNumElt = DstVT.getVectorNumElements();
6074 // Compute the new index.
6075 const APInt &APIntIndex = Index->getAPIntValue();
6076 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6077 NewIndex *= APIntIndex;
6078 // Check if the new constant index fits into i32.
6079 if (NewIndex.getBitWidth() > 32)
6080 return SDValue();
6082 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6083 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6084 SDLoc dl(Op);
6085 SDValue ExtractSrc = Op.getOperand(0);
6086 EVT VecVT = EVT::getVectorVT(
6087 *DAG.getContext(), DstVT.getScalarType(),
6088 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6089 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6090 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6091 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6094 /// ExpandBITCAST - If the target supports VFP, this function is called to
6095 /// expand a bit convert where either the source or destination type is i64 to
6096 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6097 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
6098 /// vectors), since the legalizer won't know what to do with that.
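// Rough sketch of the expansions performed below:
//   i64 -> f64 (or another legal 64-bit type):
//     (bitcast x) => (bitcast (VMOVDRR (extract_element x, 0), (extract_element x, 1)))
//   f64 (or a legal 64-bit vector) -> i64:
//     (bitcast x) => (build_pair (VMOVRRD x):0, (VMOVRRD x):1)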
6099 SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6100 const ARMSubtarget *Subtarget) const {
6101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6102 SDLoc dl(N);
6103 SDValue Op = N->getOperand(0);
6105 // This function is only supposed to be called when the source or destination of
6106 // the bit convert is i64, or when an i16/i32 is converted to or from f16/bf16.
6107 EVT SrcVT = Op.getValueType();
6108 EVT DstVT = N->getValueType(0);
6110 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6111 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6112 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6113 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6115 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6116 (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
6117 return DAG.getNode(
6118 ISD::TRUNCATE, SDLoc(N), DstVT,
6119 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6121 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6122 return SDValue();
6124 // Turn i64->f64 into VMOVDRR.
6125 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
6126 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6127 // if we can combine the bitcast with its source.
6128 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6129 return Val;
6131 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
6132 DAG.getConstant(0, dl, MVT::i32));
6133 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
6134 DAG.getConstant(1, dl, MVT::i32));
6135 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6136 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6139 // Turn f64->i64 into VMOVRRD.
6140 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
6141 SDValue Cvt;
6142 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6143 SrcVT.getVectorNumElements() > 1)
6144 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6145 DAG.getVTList(MVT::i32, MVT::i32),
6146 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6147 else
6148 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6149 DAG.getVTList(MVT::i32, MVT::i32), Op);
6150 // Merge the pieces into a single i64 value.
6151 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6154 return SDValue();
6157 /// getZeroVector - Returns a vector of specified type with all zero elements.
6158 /// Zero vectors are used to represent vector negation and in those cases
6159 /// will be implemented with the NEON VNEG instruction. However, VNEG does
6160 /// not support i64 elements, so sometimes the zero vectors will need to be
6161 /// explicitly constructed. Regardless, use a canonical VMOV to create the
6162 /// zero vector.
6163 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6164 assert(VT.isVector() && "Expected a vector type");
6165 // The canonical modified immediate encoding of a zero vector is....0!
6166 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6167 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6168 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6169 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6172 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6173 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
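// Sketch of the expansion for a 64-bit right shift by Amt (each half is i32,
// and the selects below are emitted as CMOVs on the sign of Amt - 32):
//   Lo = Amt < 32 ? (LoIn >>u Amt) | (HiIn << (32 - Amt)) : HiIn >> (Amt - 32)
//   Hi = Amt < 32 ? HiIn >> Amt : (SRA ? HiIn >> 31 : 0)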
6174 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6175 SelectionDAG &DAG) const {
6176 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6177 EVT VT = Op.getValueType();
6178 unsigned VTBits = VT.getSizeInBits();
6179 SDLoc dl(Op);
6180 SDValue ShOpLo = Op.getOperand(0);
6181 SDValue ShOpHi = Op.getOperand(1);
6182 SDValue ShAmt = Op.getOperand(2);
6183 SDValue ARMcc;
6184 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6185 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6187 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6189 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6190 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6191 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6192 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6193 DAG.getConstant(VTBits, dl, MVT::i32));
6194 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6195 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6196 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6197 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6198 ISD::SETGE, ARMcc, DAG, dl);
6199 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
6200 ARMcc, CCR, CmpLo);
6202 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6203 SDValue HiBigShift = Opc == ISD::SRA
6204 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6205 DAG.getConstant(VTBits - 1, dl, VT))
6206 : DAG.getConstant(0, dl, VT);
6207 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6208 ISD::SETGE, ARMcc, DAG, dl);
6209 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6210 ARMcc, CCR, CmpHi);
6212 SDValue Ops[2] = { Lo, Hi };
6213 return DAG.getMergeValues(Ops, dl);
6216 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6217 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
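// Sketch of the expansion for a 64-bit left shift by Amt:
//   Hi = Amt < 32 ? (HiIn << Amt) | (LoIn >>u (32 - Amt)) : LoIn << (Amt - 32)
//   Lo = Amt < 32 ? LoIn << Amt : 0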
6218 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6219 SelectionDAG &DAG) const {
6220 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6221 EVT VT = Op.getValueType();
6222 unsigned VTBits = VT.getSizeInBits();
6223 SDLoc dl(Op);
6224 SDValue ShOpLo = Op.getOperand(0);
6225 SDValue ShOpHi = Op.getOperand(1);
6226 SDValue ShAmt = Op.getOperand(2);
6227 SDValue ARMcc;
6228 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6230 assert(Op.getOpcode() == ISD::SHL_PARTS);
6231 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6232 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6233 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6234 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6235 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6237 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6238 DAG.getConstant(VTBits, dl, MVT::i32));
6239 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6240 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6241 ISD::SETGE, ARMcc, DAG, dl);
6242 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6243 ARMcc, CCR, CmpHi);
6245 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6246 ISD::SETGE, ARMcc, DAG, dl);
6247 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6248 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6249 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
6251 SDValue Ops[2] = { Lo, Hi };
6252 return DAG.getMergeValues(Ops, dl);
6255 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
6256 SelectionDAG &DAG) const {
6257 // The rounding mode is in bits 23:22 of the FPSCR.
6258 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6259 // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
6260 // so that the shift and the AND get folded into a bitfield extract.
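// Worked example: FPSCR.RMode == 0b01 (round towards +infinity) yields
// ((1 + 1) & 3) == 2, the FLT_ROUNDS value for upward rounding.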
6261 SDLoc dl(Op);
6262 SDValue Chain = Op.getOperand(0);
6263 SDValue Ops[] = {Chain,
6264 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6266 SDValue FPSCR =
6267 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6268 Chain = FPSCR.getValue(1);
6269 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6270 DAG.getConstant(1U << 22, dl, MVT::i32));
6271 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6272 DAG.getConstant(22, dl, MVT::i32));
6273 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6274 DAG.getConstant(3, dl, MVT::i32));
6275 return DAG.getMergeValues({And, Chain}, dl);
6278 SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6279 SelectionDAG &DAG) const {
6280 SDLoc DL(Op);
6281 SDValue Chain = Op->getOperand(0);
6282 SDValue RMValue = Op->getOperand(1);
6284 // The rounding mode is in bits 23:22 of the FPSCR.
6285 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6286 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6287 // (((arg - 1) & 3) << 22).
6289 // It is expected that the argument of llvm.set.rounding is within the
6290 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6291 // the responsibility of the code that generates llvm.set.rounding to ensure this
6292 // condition.
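// Worked example: llvm.set.rounding(2) (round towards +infinity) yields
// ((2 - 1) & 3) == 1, i.e. FPSCR.RMode is set to 0b01 below.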
6294 // Calculate new value of FPSCR[23:22].
6295 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6296 DAG.getConstant(1, DL, MVT::i32));
6297 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6298 DAG.getConstant(0x3, DL, MVT::i32));
6299 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6300 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6302 // Get current value of FPSCR.
6303 SDValue Ops[] = {Chain,
6304 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6305 SDValue FPSCR =
6306 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6307 Chain = FPSCR.getValue(1);
6308 FPSCR = FPSCR.getValue(0);
6310 // Put new rounding mode into FPSCR[23:22].
6311 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6312 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6313 DAG.getConstant(RMMask, DL, MVT::i32));
6314 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6315 SDValue Ops2[] = {
6316 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6317 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6320 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6321 const ARMSubtarget *ST) {
6322 SDLoc dl(N);
6323 EVT VT = N->getValueType(0);
6324 if (VT.isVector() && ST->hasNEON()) {
6326 // Compute the least significant set bit: LSB = X & -X
6327 SDValue X = N->getOperand(0);
6328 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6329 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6331 EVT ElemTy = VT.getVectorElementType();
6333 if (ElemTy == MVT::i8) {
6334 // Compute with: cttz(x) = ctpop(lsb - 1)
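// (lsb - 1 has exactly cttz(x) low bits set, so ctpop(lsb - 1) == cttz(x);
// for x == 0 this gives the element width, matching cttz(0).)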
6335 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6336 DAG.getTargetConstant(1, dl, ElemTy));
6337 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6338 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6341 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6342 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6343 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
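// e.g. for an i32 lane x with its lowest set bit at position 3:
// lsb = x & -x = 8, ctlz(8) = 28, so cttz = 31 - 28 = 3.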
6344 unsigned NumBits = ElemTy.getSizeInBits();
6345 SDValue WidthMinus1 =
6346 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6347 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6348 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6349 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6352 // Compute with: cttz(x) = ctpop(lsb - 1)
6354 // Compute LSB - 1.
6355 SDValue Bits;
6356 if (ElemTy == MVT::i64) {
6357 // Load constant 0xffff'ffff'ffff'ffff to register.
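// (0x1eff is the pre-encoded VMOV modified immediate: OpCmode = 0x1e,
// Imm = 0xff, i.e. every byte of the 64-bit element is 0xff.)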
6358 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6359 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6360 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6361 } else {
6362 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6363 DAG.getTargetConstant(1, dl, ElemTy));
6364 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6366 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6369 if (!ST->hasV6T2Ops())
6370 return SDValue();
6372 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6373 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6376 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6377 const ARMSubtarget *ST) {
6378 EVT VT = N->getValueType(0);
6379 SDLoc DL(N);
6381 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6382 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6383 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6384 "Unexpected type for custom ctpop lowering");
6386 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6387 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6388 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6389 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6391 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
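// e.g. for VT == v4i32: ctpop as v16i8, then vpaddl.u8 to v8i16, then
// vpaddl.u16 to v4i32.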
6392 unsigned EltSize = 8;
6393 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6394 while (EltSize != VT.getScalarSizeInBits()) {
6395 SmallVector<SDValue, 8> Ops;
6396 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6397 TLI.getPointerTy(DAG.getDataLayout())));
6398 Ops.push_back(Res);
6400 EltSize *= 2;
6401 NumElts /= 2;
6402 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6403 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6406 return Res;
6409 /// getVShiftImm - Check if this is a valid build_vector for the immediate
6410 /// operand of a vector shift operation, where all the elements of the
6411 /// build_vector must have the same constant integer value.
6412 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6413 // Ignore bit_converts.
6414 while (Op.getOpcode() == ISD::BITCAST)
6415 Op = Op.getOperand(0);
6416 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6417 APInt SplatBits, SplatUndef;
6418 unsigned SplatBitSize;
6419 bool HasAnyUndefs;
6420 if (!BVN ||
6421 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6422 ElementBits) ||
6423 SplatBitSize > ElementBits)
6424 return false;
6425 Cnt = SplatBits.getSExtValue();
6426 return true;
6429 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
6430 /// operand of a vector shift left operation. That value must be in the range:
6431 /// 0 <= Value < ElementBits for a left shift; or
6432 /// 0 <= Value <= ElementBits for a long left shift.
6433 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6434 assert(VT.isVector() && "vector shift count is not a vector type");
6435 int64_t ElementBits = VT.getScalarSizeInBits();
6436 if (!getVShiftImm(Op, ElementBits, Cnt))
6437 return false;
6438 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6441 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
6442 /// operand of a vector shift right operation. For a shift opcode, the value
6443 /// is positive, but for an intrinsic the value count must be negative. The
6444 /// absolute value must be in the range:
6445 /// 1 <= |Value| <= ElementBits for a right shift; or
6446 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6447 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6448 int64_t &Cnt) {
6449 assert(VT.isVector() && "vector shift count is not a vector type");
6450 int64_t ElementBits = VT.getScalarSizeInBits();
6451 if (!getVShiftImm(Op, ElementBits, Cnt))
6452 return false;
6453 if (!isIntrinsic)
6454 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6455 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6456 Cnt = -Cnt;
6457 return true;
6459 return false;
6462 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6463 const ARMSubtarget *ST) {
6464 EVT VT = N->getValueType(0);
6465 SDLoc dl(N);
6466 int64_t Cnt;
6468 if (!VT.isVector())
6469 return SDValue();
6471 // We essentially have two forms here. Shift by an immediate and shift by a
6472 // vector register (there is also a shift by a GPR, but that is just handled
6473 // with a tablegen pattern). We cannot easily match shift by an immediate in
6474 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6475 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6476 // signed or unsigned, and a negative shift indicates a shift right).
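// Illustrative example: a v4i32 lshr by a non-constant vector amount becomes
// VSHLu(x, 0 - amt), while an lshr by a uniform constant 2 becomes VSHRuIMM(x, 2).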
6477 if (N->getOpcode() == ISD::SHL) {
6478 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6479 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6480 DAG.getConstant(Cnt, dl, MVT::i32));
6481 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6482 N->getOperand(1));
6485 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6486 "unexpected vector shift opcode");
6488 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6489 unsigned VShiftOpc =
6490 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6491 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6492 DAG.getConstant(Cnt, dl, MVT::i32));
6495 // Other right shifts we don't have operations for (we use a shift left by a
6496 // negative number).
6497 EVT ShiftVT = N->getOperand(1).getValueType();
6498 SDValue NegatedCount = DAG.getNode(
6499 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6500 unsigned VShiftOpc =
6501 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6502 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6505 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6506 const ARMSubtarget *ST) {
6507 EVT VT = N->getValueType(0);
6508 SDLoc dl(N);
6510 // We can get here for a node like i32 = ISD::SHL i32, i64
6511 if (VT != MVT::i64)
6512 return SDValue();
6514 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6515 N->getOpcode() == ISD::SHL) &&
6516 "Unknown shift to lower!");
6518 unsigned ShOpc = N->getOpcode();
6519 if (ST->hasMVEIntegerOps()) {
6520 SDValue ShAmt = N->getOperand(1);
6521 unsigned ShPartsOpc = ARMISD::LSLL;
6522 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6524 // If the shift amount is wider than 64 bits, or is a constant that is zero or
6525 // at least 32, then fall back to the default expansion.
6526 if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
6527 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
6528 return SDValue();
6530 // Extract the lower 32 bits of the shift amount if it's not an i32
6531 if (ShAmt->getValueType(0) != MVT::i32)
6532 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6534 if (ShOpc == ISD::SRL) {
6535 if (!Con)
6536 // There is no t2LSRLr instruction so negate and perform an lsll if the
6537 // shift amount is in a register, emulating a right shift.
6538 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6539 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6540 else
6541 // Else generate an lsrl on the immediate shift amount
6542 ShPartsOpc = ARMISD::LSRL;
6543 } else if (ShOpc == ISD::SRA)
6544 ShPartsOpc = ARMISD::ASRL;
6546 // Lower 32 bits of the destination/source
6547 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6548 DAG.getConstant(0, dl, MVT::i32));
6549 // Upper 32 bits of the destination/source
6550 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6551 DAG.getConstant(1, dl, MVT::i32));
6553 // Generate the shift operation as computed above
6554 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6555 ShAmt);
6556 // The upper 32 bits come from the second return value of lsll
6557 Hi = SDValue(Lo.getNode(), 1);
6558 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6561 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6562 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6563 return SDValue();
6565 // If we are in thumb mode, we don't have RRX.
6566 if (ST->isThumb1Only())
6567 return SDValue();
6569 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
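// Illustrative expansion for SRL by one:
//   {Hi', carry} = SRL_FLAG Hi   (shift right by one, shifted-out bit -> carry)
//   Lo'          = RRX Lo        (rotate right with extend, carry -> bit 31)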
6570 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6571 DAG.getConstant(0, dl, MVT::i32));
6572 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6573 DAG.getConstant(1, dl, MVT::i32));
6575 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
6576 // captures the shifted-out bit in the carry flag.
6577 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
6578 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
6580 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6581 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6583 // Merge the pieces into a single i64 value.
6584 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6587 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6588 const ARMSubtarget *ST) {
6589 bool Invert = false;
6590 bool Swap = false;
6591 unsigned Opc = ARMCC::AL;
6593 SDValue Op0 = Op.getOperand(0);
6594 SDValue Op1 = Op.getOperand(1);
6595 SDValue CC = Op.getOperand(2);
6596 EVT VT = Op.getValueType();
6597 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6598 SDLoc dl(Op);
6600 EVT CmpVT;
6601 if (ST->hasNEON())
6602 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
6603 else {
6604 assert(ST->hasMVEIntegerOps() &&
6605 "No hardware support for integer vector comparison!");
6607 if (Op.getValueType().getVectorElementType() != MVT::i1)
6608 return SDValue();
6610 // Make sure we expand floating point setcc to scalar if we do not have
6611 // mve.fp, so that we can handle them from there.
6612 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6613 return SDValue();
6615 CmpVT = VT;
6618 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6619 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6620 // Special-case integer 64-bit equality comparisons. They aren't legal,
6621 // but they can be lowered with a few vector instructions.
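// Sketch: compare the halves as i32 lanes, AND each 32-bit result with its
// VREV64-swapped partner so a 64-bit lane is all-ones only if both halves
// matched, then invert for SETNE.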
6622 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6623 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6624 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6625 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6626 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6627 DAG.getCondCode(ISD::SETEQ));
6628 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6629 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6630 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6631 if (SetCCOpcode == ISD::SETNE)
6632 Merged = DAG.getNOT(dl, Merged, CmpVT);
6633 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6634 return Merged;
6637 if (CmpVT.getVectorElementType() == MVT::i64)
6638 // 64-bit comparisons are not legal in general.
6639 return SDValue();
6641 if (Op1.getValueType().isFloatingPoint()) {
6642 switch (SetCCOpcode) {
6643 default: llvm_unreachable("Illegal FP comparison");
6644 case ISD::SETUNE:
6645 case ISD::SETNE:
6646 if (ST->hasMVEFloatOps()) {
6647 Opc = ARMCC::NE; break;
6648 } else {
6649 Invert = true; LLVM_FALLTHROUGH;
6651 case ISD::SETOEQ:
6652 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6653 case ISD::SETOLT:
6654 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
6655 case ISD::SETOGT:
6656 case ISD::SETGT: Opc = ARMCC::GT; break;
6657 case ISD::SETOLE:
6658 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
6659 case ISD::SETOGE:
6660 case ISD::SETGE: Opc = ARMCC::GE; break;
6661 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
6662 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6663 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
6664 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6665 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
6666 case ISD::SETONE: {
6667 // Expand this to (OLT | OGT).
6668 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6669 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6670 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6671 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6672 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6673 if (Invert)
6674 Result = DAG.getNOT(dl, Result, VT);
6675 return Result;
6677 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
6678 case ISD::SETO: {
6679 // Expand this to (OLT | OGE).
6680 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6681 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6682 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6683 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6684 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6685 if (Invert)
6686 Result = DAG.getNOT(dl, Result, VT);
6687 return Result;
6690 } else {
6691 // Integer comparisons.
6692 switch (SetCCOpcode) {
6693 default: llvm_unreachable("Illegal integer comparison");
6694 case ISD::SETNE:
6695 if (ST->hasMVEIntegerOps()) {
6696 Opc = ARMCC::NE; break;
6697 } else {
6698 Invert = true; LLVM_FALLTHROUGH;
6700 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6701 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
6702 case ISD::SETGT: Opc = ARMCC::GT; break;
6703 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
6704 case ISD::SETGE: Opc = ARMCC::GE; break;
6705 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
6706 case ISD::SETUGT: Opc = ARMCC::HI; break;
6707 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
6708 case ISD::SETUGE: Opc = ARMCC::HS; break;
6711 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6712 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6713 SDValue AndOp;
6714 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6715 AndOp = Op0;
6716 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6717 AndOp = Op1;
6719 // Ignore bitconvert.
6720 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6721 AndOp = AndOp.getOperand(0);
6723 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6724 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6725 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6726 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6727 if (!Invert)
6728 Result = DAG.getNOT(dl, Result, VT);
6729 return Result;
6734 if (Swap)
6735 std::swap(Op0, Op1);
6737 // If one of the operands is a constant vector zero, attempt to fold the
6738 // comparison to a specialized compare-against-zero form.
6739 SDValue SingleOp;
6740 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6741 SingleOp = Op0;
6742 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
6743 if (Opc == ARMCC::GE)
6744 Opc = ARMCC::LE;
6745 else if (Opc == ARMCC::GT)
6746 Opc = ARMCC::LT;
6747 SingleOp = Op1;
6750 SDValue Result;
6751 if (SingleOp.getNode()) {
6752 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
6753 DAG.getConstant(Opc, dl, MVT::i32));
6754 } else {
6755 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6756 DAG.getConstant(Opc, dl, MVT::i32));
6759 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6761 if (Invert)
6762 Result = DAG.getNOT(dl, Result, VT);
6764 return Result;
6767 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6768 SDValue LHS = Op.getOperand(0);
6769 SDValue RHS = Op.getOperand(1);
6770 SDValue Carry = Op.getOperand(2);
6771 SDValue Cond = Op.getOperand(3);
6772 SDLoc DL(Op);
6774 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6776 // ARMISD::SUBE expects a carry, not a borrow like ISD::SUBCARRY, so we
6777 // have to invert the carry first.
6778 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6779 DAG.getConstant(1, DL, MVT::i32), Carry);
6780 // This converts the boolean value carry into the carry flag.
6781 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6783 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6784 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6786 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6787 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6788 SDValue ARMcc = DAG.getConstant(
6789 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6790 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6791 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
6792 Cmp.getValue(1), SDValue());
6793 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6794 CCR, Chain.getValue(1));
6797 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6798 /// valid vector constant for a NEON or MVE instruction with a "modified
6799 /// immediate" operand (e.g., VMOV). If so, return the encoded value.
6800 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6801 unsigned SplatBitSize, SelectionDAG &DAG,
6802 const SDLoc &dl, EVT &VT, EVT VectorVT,
6803 VMOVModImmType type) {
6804 unsigned OpCmode, Imm;
6805 bool is128Bits = VectorVT.is128BitVector();
6807 // SplatBitSize is set to the smallest size that splats the vector, so a
6808 // zero vector will always have SplatBitSize == 8. However, NEON modified
6809 // immediate instructions other than VMOV do not support the 8-bit encoding
6810 // of a zero vector, and the default encoding of zero is supposed to be the
6811 // 32-bit version.
6812 if (SplatBits == 0)
6813 SplatBitSize = 32;
6815 switch (SplatBitSize) {
6816 case 8:
6817 if (type != VMOVModImm)
6818 return SDValue();
6819 // Any 1-byte value is OK. Op=0, Cmode=1110.
6820 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6821 OpCmode = 0xe;
6822 Imm = SplatBits;
6823 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6824 break;
6826 case 16:
6827 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6828 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6829 if ((SplatBits & ~0xff) == 0) {
6830 // Value = 0x00nn: Op=x, Cmode=100x.
6831 OpCmode = 0x8;
6832 Imm = SplatBits;
6833 break;
6835 if ((SplatBits & ~0xff00) == 0) {
6836 // Value = 0xnn00: Op=x, Cmode=101x.
6837 OpCmode = 0xa;
6838 Imm = SplatBits >> 8;
6839 break;
6841 return SDValue();
6843 case 32:
6844 // NEON's 32-bit VMOV supports splat values where:
6845 // * only one byte is nonzero, or
6846 // * the least significant byte is 0xff and the second byte is nonzero, or
6847 // * the least significant 2 bytes are 0xff and the third is nonzero.
6848 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6849 if ((SplatBits & ~0xff) == 0) {
6850 // Value = 0x000000nn: Op=x, Cmode=000x.
6851 OpCmode = 0;
6852 Imm = SplatBits;
6853 break;
6855 if ((SplatBits & ~0xff00) == 0) {
6856 // Value = 0x0000nn00: Op=x, Cmode=001x.
6857 OpCmode = 0x2;
6858 Imm = SplatBits >> 8;
6859 break;
6861 if ((SplatBits & ~0xff0000) == 0) {
6862 // Value = 0x00nn0000: Op=x, Cmode=010x.
6863 OpCmode = 0x4;
6864 Imm = SplatBits >> 16;
6865 break;
6867 if ((SplatBits & ~0xff000000) == 0) {
6868 // Value = 0xnn000000: Op=x, Cmode=011x.
6869 OpCmode = 0x6;
6870 Imm = SplatBits >> 24;
6871 break;
6874 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6875 if (type == OtherModImm) return SDValue();
6877 if ((SplatBits & ~0xffff) == 0 &&
6878 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6879 // Value = 0x0000nnff: Op=x, Cmode=1100.
6880 OpCmode = 0xc;
6881 Imm = SplatBits >> 8;
6882 break;
6885 // cmode == 0b1101 is not supported for MVE VMVN
6886 if (type == MVEVMVNModImm)
6887 return SDValue();
6889 if ((SplatBits & ~0xffffff) == 0 &&
6890 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6891 // Value = 0x00nnffff: Op=x, Cmode=1101.
6892 OpCmode = 0xd;
6893 Imm = SplatBits >> 16;
6894 break;
6897 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6898 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6899 // VMOV.I32. A (very) minor optimization would be to replicate the value
6900 // and fall through here to test for a valid 64-bit splat. But, then the
6901 // caller would also need to check and handle the change in size.
6902 return SDValue();
6904 case 64: {
6905 if (type != VMOVModImm)
6906 return SDValue();
6907 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
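// e.g. a splat of 0x00ff00ff00ff00ff is encoded with Imm = 0b01010101 (0x55).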
6908 uint64_t BitMask = 0xff;
6909 unsigned ImmMask = 1;
6910 Imm = 0;
6911 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6912 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6913 Imm |= ImmMask;
6914 } else if ((SplatBits & BitMask) != 0) {
6915 return SDValue();
6917 BitMask <<= 8;
6918 ImmMask <<= 1;
6921 if (DAG.getDataLayout().isBigEndian()) {
6922 // Reverse the order of elements within the vector.
6923 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
6924 unsigned Mask = (1 << BytesPerElem) - 1;
6925 unsigned NumElems = 8 / BytesPerElem;
6926 unsigned NewImm = 0;
6927 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
6928 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
6929 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
6931 Imm = NewImm;
6934 // Op=1, Cmode=1110.
6935 OpCmode = 0x1e;
6936 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6937 break;
6940 default:
6941 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6944 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6945 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6948 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6949 const ARMSubtarget *ST) const {
6950 EVT VT = Op.getValueType();
6951 bool IsDouble = (VT == MVT::f64);
6952 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6953 const APFloat &FPVal = CFP->getValueAPF();
6955 // Prevent floating-point constants from using literal loads
6956 // when execute-only is enabled.
6957 if (ST->genExecuteOnly()) {
6958 // If we can represent the constant as an immediate, don't lower it
6959 if (isFPImmLegal(FPVal, VT))
6960 return Op;
6961 // Otherwise, construct as integer, and move to float register
6962 APInt INTVal = FPVal.bitcastToAPInt();
6963 SDLoc DL(CFP);
6964 switch (VT.getSimpleVT().SimpleTy) {
6965 default:
6966 llvm_unreachable("Unknown floating point type!");
6967 break;
6968 case MVT::f64: {
6969 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6970 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6971 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6973 case MVT::f32:
6974 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6975 DAG.getConstant(INTVal, DL, MVT::i32));
6979 if (!ST->hasVFP3Base())
6980 return SDValue();
6982 // Use the default (constant pool) lowering for double constants when we have
6983 // an SP-only FPU
6984 if (IsDouble && !Subtarget->hasFP64())
6985 return SDValue();
6987 // Try splatting with a VMOV.f32...
6988 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6990 if (ImmVal != -1) {
6991 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6992 // We have code in place to select a valid ConstantFP already, no need to
6993 // do any mangling.
6994 return Op;
6997 // It's a float and we are trying to use NEON operations where
6998 // possible. Lower it to a splat followed by an extract.
6999 SDLoc DL(Op);
7000 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7001 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7002 NewVal);
7003 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7004 DAG.getConstant(0, DL, MVT::i32));
7007 // The rest of our options are NEON only, make sure that's allowed before
7008 // proceeding..
7009 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7010 return SDValue();
7012 EVT VMovVT;
7013 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7015 // It wouldn't really be worth bothering for doubles except for one very
7016 // important value, which does happen to match: 0.0. So make sure we don't do
7017 // anything stupid.
7018 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7019 return SDValue();
7021 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7022 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7023 VMovVT, VT, VMOVModImm);
7024 if (NewVal != SDValue()) {
7025 SDLoc DL(Op);
7026 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7027 NewVal);
7028 if (IsDouble)
7029 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7031 // It's a float: cast and extract a vector element.
7032 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7033 VecConstant);
7034 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7035 DAG.getConstant(0, DL, MVT::i32));
7038 // Finally, try a VMVN.i32
7039 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7040 VT, VMVNModImm);
7041 if (NewVal != SDValue()) {
7042 SDLoc DL(Op);
7043 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7045 if (IsDouble)
7046 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7048 // It's a float: cast and extract a vector element.
7049 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7050 VecConstant);
7051 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7052 DAG.getConstant(0, DL, MVT::i32));
7055 return SDValue();
7058 // Check if a VEXT instruction can handle the shuffle mask when the
7059 // vector sources of the shuffle are the same.
7060 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7061 unsigned NumElts = VT.getVectorNumElements();
7063 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7064 if (M[0] < 0)
7065 return false;
7067 Imm = M[0];
7069 // If this is a VEXT shuffle, the immediate value is the index of the first
7070 // element. The other shuffle indices must be the successive elements after
7071 // the first one.
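// e.g. for a single-source v8i8 shuffle, mask <3,4,5,6,7,0,1,2> is a VEXT
// with Imm = 3 (the indices may wrap since both sources are the same vector).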
7072 unsigned ExpectedElt = Imm;
7073 for (unsigned i = 1; i < NumElts; ++i) {
7074 // Increment the expected index. If it wraps around, just follow it
7075 // back to index zero and keep going.
7076 ++ExpectedElt;
7077 if (ExpectedElt == NumElts)
7078 ExpectedElt = 0;
7080 if (M[i] < 0) continue; // ignore UNDEF indices
7081 if (ExpectedElt != static_cast<unsigned>(M[i]))
7082 return false;
7085 return true;
7088 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7089 bool &ReverseVEXT, unsigned &Imm) {
7090 unsigned NumElts = VT.getVectorNumElements();
7091 ReverseVEXT = false;
7093 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7094 if (M[0] < 0)
7095 return false;
7097 Imm = M[0];
7099 // If this is a VEXT shuffle, the immediate value is the index of the first
7100 // element. The other shuffle indices must be the successive elements after
7101 // the first one.
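// e.g. for two v4i32 sources, mask <3,4,5,6> is VEXT(v1, v2) with Imm = 3;
// mask <7,0,1,2> wraps past both sources, so the operands are swapped
// (ReverseVEXT) and Imm becomes 7 - 4 = 3.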
7102 unsigned ExpectedElt = Imm;
7103 for (unsigned i = 1; i < NumElts; ++i) {
7104 // Increment the expected index. If it wraps around, it may still be
7105 // a VEXT but the source vectors must be swapped.
7106 ExpectedElt += 1;
7107 if (ExpectedElt == NumElts * 2) {
7108 ExpectedElt = 0;
7109 ReverseVEXT = true;
7112 if (M[i] < 0) continue; // ignore UNDEF indices
7113 if (ExpectedElt != static_cast<unsigned>(M[i]))
7114 return false;
7117 // Adjust the index value if the source operands will be swapped.
7118 if (ReverseVEXT)
7119 Imm -= NumElts;
7121 return true;
7124 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7125 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7126 // range, then 0 is placed into the resulting vector. So pretty much any mask
7127 // of 8 elements can work here.
7128 return VT == MVT::v8i8 && M.size() == 8;
7131 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7132 unsigned Index) {
7133 if (Mask.size() == Elements * 2)
7134 return Index / Elements;
7135 return Mask[Index] == 0 ? 0 : 1;
7138 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
7139 // checking that pairs of elements in the shuffle mask represent the same index
7140 // in each vector, incrementing the expected index by 2 at each step.
7141 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7142 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7143 // v2={e,f,g,h}
7144 // WhichResult gives the offset for each element in the mask based on which
7145 // of the two results it belongs to.
7147 // The transpose can be represented either as:
7148 // result1 = shufflevector v1, v2, result1_shuffle_mask
7149 // result2 = shufflevector v1, v2, result2_shuffle_mask
7150 // where v1/v2 and the shuffle masks have the same number of elements
7151 // (here WhichResult (see below) indicates which result is being checked)
7153 // or as:
7154 // results = shufflevector v1, v2, shuffle_mask
// where both results are returned in one vector and the shuffle mask has twice
// as many elements as v1/v2 (in which case WhichResult will always be 0 if the
// mask matches). Here we check the low half and the high half of the shuffle
// mask as if each were a mask of the first form.
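// As an illustration of the second form: for v4i32 inputs, the 8-element mask
// [0, 4, 2, 6, 1, 5, 3, 7] passes the check below, since its low half matches
// the first VTRN result and its high half matches the second.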
7159 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7160 unsigned EltSz = VT.getScalarSizeInBits();
7161 if (EltSz == 64)
7162 return false;
7164 unsigned NumElts = VT.getVectorNumElements();
7165 if (M.size() != NumElts && M.size() != NumElts*2)
7166 return false;
// If the mask is twice as long as the input vector then we need to check the
// upper and lower parts of the mask with a matching value for WhichResult.
// FIXME: A mask with only even values will be rejected in case the first
// element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
// M[0] is used to determine WhichResult.
7173 for (unsigned i = 0; i < M.size(); i += NumElts) {
7174 WhichResult = SelectPairHalf(NumElts, M, i);
7175 for (unsigned j = 0; j < NumElts; j += 2) {
7176 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7177 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7178 return false;
7182 if (M.size() == NumElts*2)
7183 WhichResult = 0;
7185 return true;
7188 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7189 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7190 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7191 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7192 unsigned EltSz = VT.getScalarSizeInBits();
7193 if (EltSz == 64)
7194 return false;
7196 unsigned NumElts = VT.getVectorNumElements();
7197 if (M.size() != NumElts && M.size() != NumElts*2)
7198 return false;
7200 for (unsigned i = 0; i < M.size(); i += NumElts) {
7201 WhichResult = SelectPairHalf(NumElts, M, i);
7202 for (unsigned j = 0; j < NumElts; j += 2) {
7203 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7204 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7205 return false;
7209 if (M.size() == NumElts*2)
7210 WhichResult = 0;
7212 return true;
7215 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7216 // that the mask elements are either all even and in steps of size 2 or all odd
7217 // and in steps of size 2.
7218 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7219 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7220 // v2={e,f,g,h}
// Requires checks similar to those of isVTRNMask with respect to how the
// results are returned.
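// For illustration: with the same v4i32 inputs, the mask [1, 3, 5, 7] selects
// the second VUZP result (WhichResult == 1), i.e. x={b,d,f,h}.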
7223 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7224 unsigned EltSz = VT.getScalarSizeInBits();
7225 if (EltSz == 64)
7226 return false;
7228 unsigned NumElts = VT.getVectorNumElements();
7229 if (M.size() != NumElts && M.size() != NumElts*2)
7230 return false;
7232 for (unsigned i = 0; i < M.size(); i += NumElts) {
7233 WhichResult = SelectPairHalf(NumElts, M, i);
7234 for (unsigned j = 0; j < NumElts; ++j) {
7235 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7236 return false;
7240 if (M.size() == NumElts*2)
7241 WhichResult = 0;
7243 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7244 if (VT.is64BitVector() && EltSz == 32)
7245 return false;
7247 return true;
7250 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7251 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7253 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7254 unsigned EltSz = VT.getScalarSizeInBits();
7255 if (EltSz == 64)
7256 return false;
7258 unsigned NumElts = VT.getVectorNumElements();
7259 if (M.size() != NumElts && M.size() != NumElts*2)
7260 return false;
7262 unsigned Half = NumElts / 2;
7263 for (unsigned i = 0; i < M.size(); i += NumElts) {
7264 WhichResult = SelectPairHalf(NumElts, M, i);
7265 for (unsigned j = 0; j < NumElts; j += Half) {
7266 unsigned Idx = WhichResult;
7267 for (unsigned k = 0; k < Half; ++k) {
7268 int MIdx = M[i + j + k];
7269 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7270 return false;
7271 Idx += 2;
7276 if (M.size() == NumElts*2)
7277 WhichResult = 0;
7279 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7280 if (VT.is64BitVector() && EltSz == 32)
7281 return false;
7283 return true;
7286 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7287 // that pairs of elements of the shufflemask represent the same index in each
7288 // vector incrementing sequentially through the vectors.
7289 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7290 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7291 // v2={e,f,g,h}
// Requires checks similar to those of isVTRNMask with respect to how the
// results are returned.
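// For illustration: with the same v4i32 inputs, the mask [2, 6, 3, 7] selects
// the second VZIP result (WhichResult == 1), i.e. x={c,g,d,h}.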
7294 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7295 unsigned EltSz = VT.getScalarSizeInBits();
7296 if (EltSz == 64)
7297 return false;
7299 unsigned NumElts = VT.getVectorNumElements();
7300 if (M.size() != NumElts && M.size() != NumElts*2)
7301 return false;
7303 for (unsigned i = 0; i < M.size(); i += NumElts) {
7304 WhichResult = SelectPairHalf(NumElts, M, i);
7305 unsigned Idx = WhichResult * NumElts / 2;
7306 for (unsigned j = 0; j < NumElts; j += 2) {
7307 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7308 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7309 return false;
7310 Idx += 1;
7314 if (M.size() == NumElts*2)
7315 WhichResult = 0;
7317 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7318 if (VT.is64BitVector() && EltSz == 32)
7319 return false;
7321 return true;
7324 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7325 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7326 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7327 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7328 unsigned EltSz = VT.getScalarSizeInBits();
7329 if (EltSz == 64)
7330 return false;
7332 unsigned NumElts = VT.getVectorNumElements();
7333 if (M.size() != NumElts && M.size() != NumElts*2)
7334 return false;
7336 for (unsigned i = 0; i < M.size(); i += NumElts) {
7337 WhichResult = SelectPairHalf(NumElts, M, i);
7338 unsigned Idx = WhichResult * NumElts / 2;
7339 for (unsigned j = 0; j < NumElts; j += 2) {
7340 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7341 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7342 return false;
7343 Idx += 1;
7347 if (M.size() == NumElts*2)
7348 WhichResult = 0;
7350 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7351 if (VT.is64BitVector() && EltSz == 32)
7352 return false;
7354 return true;
7357 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7358 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7359 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7360 unsigned &WhichResult,
7361 bool &isV_UNDEF) {
7362 isV_UNDEF = false;
7363 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7364 return ARMISD::VTRN;
7365 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7366 return ARMISD::VUZP;
7367 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7368 return ARMISD::VZIP;
7370 isV_UNDEF = true;
7371 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7372 return ARMISD::VTRN;
7373 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7374 return ARMISD::VUZP;
7375 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7376 return ARMISD::VZIP;
7378 return 0;
/// \return true if this is a reverse operation on a vector.
7382 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7383 unsigned NumElts = VT.getVectorNumElements();
7384 // Make sure the mask has the right size.
7385 if (NumElts != M.size())
7386 return false;
7388 // Look for <15, ..., 3, -1, 1, 0>.
7389 for (unsigned i = 0; i != NumElts; ++i)
7390 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7391 return false;
7393 return true;
7396 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7397 unsigned NumElts = VT.getVectorNumElements();
7398 // Make sure the mask has the right size.
7399 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7400 return false;
7402 // If Top
7403 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7404 // This inserts Input2 into Input1
7405 // else if not Top
7406 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7407 // This inserts Input1 into Input2
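// e.g. for v8i16 with Top set and two sources, the mask
// <0, 8, 2, 10, 4, 12, 6, 14> matches: the even result lanes keep the even
// lanes of Input1 and the odd result lanes take the even lanes of Input2.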
7408 unsigned Offset = Top ? 0 : 1;
7409 unsigned N = SingleSource ? 0 : NumElts;
7410 for (unsigned i = 0; i < NumElts; i += 2) {
7411 if (M[i] >= 0 && M[i] != (int)i)
7412 return false;
7413 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7414 return false;
7417 return true;
7420 static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7421 unsigned NumElts = ToVT.getVectorNumElements();
7422 if (NumElts != M.size())
7423 return false;
// Test whether the trunc can be converted to a VMOVN with this shuffle. We are
7426 // looking for patterns of:
7427 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7428 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7430 unsigned Off0 = rev ? NumElts / 2 : 0;
7431 unsigned Off1 = rev ? 0 : NumElts / 2;
7432 for (unsigned i = 0; i < NumElts; i += 2) {
7433 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7434 return false;
7435 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7436 return false;
7439 return true;
7442 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7443 // from a pair of inputs. For example:
//   BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0)),
//               FP_ROUND(EXTRACT_ELT(Y, 0)),
//               FP_ROUND(EXTRACT_ELT(X, 1)),
//               FP_ROUND(EXTRACT_ELT(Y, 1)), ...)
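// When the pattern matches, the build_vector is rewritten (at the end of this
// function) as two chained ARMISD::VCVTN nodes: one narrowing X into one set
// of alternating f16 lanes and one narrowing Y into the other set.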
7448 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7449 const ARMSubtarget *ST) {
7450 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7451 if (!ST->hasMVEFloatOps())
7452 return SDValue();
7454 SDLoc dl(BV);
7455 EVT VT = BV.getValueType();
7456 if (VT != MVT::v8f16)
7457 return SDValue();
// We are looking for a buildvector of fptrunc elements, where the elements are
// extracted from two sources in an interleaved fashion. Check that the first
// two items look plausible and extract some info from them (they are checked
// properly in the loop below).
7463 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7464 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7465 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7466 return SDValue();
7467 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7468 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7469 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7470 return SDValue();
7471 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7472 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7473 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7474 return SDValue();
7476 // Check all the values in the BuildVector line up with our expectations.
7477 for (unsigned i = 1; i < 4; i++) {
7478 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7479 return Trunc.getOpcode() == ISD::FP_ROUND &&
7480 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7481 Trunc.getOperand(0).getOperand(0) == Op &&
7482 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7484 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7485 return SDValue();
7486 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7487 return SDValue();
7490 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7491 DAG.getConstant(0, dl, MVT::i32));
7492 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7493 DAG.getConstant(1, dl, MVT::i32));
7496 // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7497 // from a single input on alternating lanes. For example:
//   BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0)),
//               FP_EXTEND(EXTRACT_ELT(X, 2)),
//               FP_EXTEND(EXTRACT_ELT(X, 4)), ...)
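// When the pattern matches, the whole build_vector becomes a single
// ARMISD::VCVTL node, VCVTL(X, Offset), where Offset selects whether the even
// or the odd source lanes are extended.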
7501 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7502 const ARMSubtarget *ST) {
7503 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7504 if (!ST->hasMVEFloatOps())
7505 return SDValue();
7507 SDLoc dl(BV);
7508 EVT VT = BV.getValueType();
7509 if (VT != MVT::v4f32)
7510 return SDValue();
// We are looking for a buildvector of fpext elements, where all the elements
// are alternating lanes from a single source. For example <0,2,4,6> or
// <1,3,5,7>. Check that the first two items look plausible and extract some
// info from them (they are checked properly in the loop below).
7516 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7517 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7518 return SDValue();
7519 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7520 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7521 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7522 return SDValue();
7524 // Check all the values in the BuildVector line up with our expectations.
7525 for (unsigned i = 1; i < 4; i++) {
7526 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7527 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7528 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7529 Trunc.getOperand(0).getOperand(0) == Op &&
7530 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7532 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7533 return SDValue();
7536 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7537 DAG.getConstant(Offset, dl, MVT::i32));
7540 // If N is an integer constant that can be moved into a register in one
7541 // instruction, return an SDValue of such a constant (will become a MOV
7542 // instruction). Otherwise return null.
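// e.g. 0xab fits the Thumb1 MOV range, and on ARM 0xab000000 is a valid
// modified immediate (0xab rotated right by 8), so both would be accepted
// on their respective subtargets.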
7543 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7544 const ARMSubtarget *ST, const SDLoc &dl) {
7545 uint64_t Val;
7546 if (!isa<ConstantSDNode>(N))
7547 return SDValue();
7548 Val = cast<ConstantSDNode>(N)->getZExtValue();
7550 if (ST->isThumb1Only()) {
7551 if (Val <= 255 || ~Val <= 255)
7552 return DAG.getConstant(Val, dl, MVT::i32);
7553 } else {
7554 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7555 return DAG.getConstant(Val, dl, MVT::i32);
7557 return SDValue();
7560 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7561 const ARMSubtarget *ST) {
7562 SDLoc dl(Op);
7563 EVT VT = Op.getValueType();
7565 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7567 unsigned NumElts = VT.getVectorNumElements();
7568 unsigned BoolMask;
7569 unsigned BitsPerBool;
7570 if (NumElts == 4) {
7571 BitsPerBool = 4;
7572 BoolMask = 0xf;
7573 } else if (NumElts == 8) {
7574 BitsPerBool = 2;
7575 BoolMask = 0x3;
7576 } else if (NumElts == 16) {
7577 BitsPerBool = 1;
7578 BoolMask = 0x1;
7579 } else
7580 return SDValue();
// If this is a single value copied into all lanes (a splat), we can just
// sign-extend that single value.
7584 SDValue FirstOp = Op.getOperand(0);
7585 if (!isa<ConstantSDNode>(FirstOp) &&
7586 std::all_of(std::next(Op->op_begin()), Op->op_end(),
7587 [&FirstOp](SDUse &U) {
7588 return U.get().isUndef() || U.get() == FirstOp;
7589 })) {
7590 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7591 DAG.getValueType(MVT::i1));
7592 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
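// Each boolean lane occupies BitsPerBool bits of the packed predicate value.
// For example, a v4i1 constant <1,0,1,1> packs to Bits32 == 0xff0f below
// (4 bits per lane).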
7595 // First create base with bits set where known
7596 unsigned Bits32 = 0;
7597 for (unsigned i = 0; i < NumElts; ++i) {
7598 SDValue V = Op.getOperand(i);
7599 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7600 continue;
7601 bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
7602 if (BitSet)
7603 Bits32 |= BoolMask << (i * BitsPerBool);
7606 // Add in unknown nodes
7607 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7608 DAG.getConstant(Bits32, dl, MVT::i32));
7609 for (unsigned i = 0; i < NumElts; ++i) {
7610 SDValue V = Op.getOperand(i);
7611 if (isa<ConstantSDNode>(V) || V.isUndef())
7612 continue;
7613 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7614 DAG.getConstant(i, dl, MVT::i32));
7617 return Base;
7620 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7621 const ARMSubtarget *ST) {
7622 if (!ST->hasMVEIntegerOps())
7623 return SDValue();
7625 // We are looking for a buildvector where each element is Op[0] + i*N
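// e.g. BUILD_VECTOR(x, x+2, x+4, x+6) becomes VIDUP(x, 2). The increment N is
// restricted to 1, 2, 4 or 8 below, matching the immediates a VIDUP can take.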
7626 EVT VT = Op.getValueType();
7627 SDValue Op0 = Op.getOperand(0);
7628 unsigned NumElts = VT.getVectorNumElements();
7630 // Get the increment value from operand 1
7631 SDValue Op1 = Op.getOperand(1);
7632 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7633 !isa<ConstantSDNode>(Op1.getOperand(1)))
7634 return SDValue();
7635 unsigned N = Op1.getConstantOperandVal(1);
7636 if (N != 1 && N != 2 && N != 4 && N != 8)
7637 return SDValue();
7639 // Check that each other operand matches
7640 for (unsigned I = 2; I < NumElts; I++) {
7641 SDValue OpI = Op.getOperand(I);
7642 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7643 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7644 OpI.getConstantOperandVal(1) != I * N)
7645 return SDValue();
7648 SDLoc DL(Op);
7649 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7650 DAG.getConstant(N, DL, MVT::i32));
7653 // If this is a case we can't handle, return null and let the default
7654 // expansion code take care of it.
7655 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7656 const ARMSubtarget *ST) const {
7657 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7658 SDLoc dl(Op);
7659 EVT VT = Op.getValueType();
7661 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7662 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7664 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7665 return R;
7667 APInt SplatBits, SplatUndef;
7668 unsigned SplatBitSize;
7669 bool HasAnyUndefs;
7670 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7671 if (SplatUndef.isAllOnesValue())
7672 return DAG.getUNDEF(VT);
7674 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7675 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7676 // Check if an immediate VMOV works.
7677 EVT VmovVT;
7678 SDValue Val =
7679 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7680 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7682 if (Val.getNode()) {
7683 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7684 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7687 // Try an immediate VMVN.
7688 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7689 Val = isVMOVModifiedImm(
7690 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7691 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7692 if (Val.getNode()) {
7693 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7694 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7697 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7698 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7699 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7700 if (ImmVal != -1) {
7701 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7702 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7706 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7707 // type.
7708 if (ST->hasMVEIntegerOps() &&
7709 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7710 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7711 : SplatBitSize == 16 ? MVT::v8i16
7712 : MVT::v16i8;
7713 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7714 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7715 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7720 // Scan through the operands to see if only one value is used.
7722 // As an optimisation, even if more than one value is used it may be more
// profitable to splat with one value and then change some lanes.
7725 // Heuristically we decide to do this if the vector has a "dominant" value,
7726 // defined as splatted to more than half of the lanes.
7727 unsigned NumElts = VT.getVectorNumElements();
7728 bool isOnlyLowElement = true;
7729 bool usesOnlyOneValue = true;
7730 bool hasDominantValue = false;
7731 bool isConstant = true;
7733 // Map of the number of times a particular SDValue appears in the
7734 // element list.
7735 DenseMap<SDValue, unsigned> ValueCounts;
7736 SDValue Value;
7737 for (unsigned i = 0; i < NumElts; ++i) {
7738 SDValue V = Op.getOperand(i);
7739 if (V.isUndef())
7740 continue;
7741 if (i > 0)
7742 isOnlyLowElement = false;
7743 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7744 isConstant = false;
7746 ValueCounts.insert(std::make_pair(V, 0));
7747 unsigned &Count = ValueCounts[V];
7749 // Is this value dominant? (takes up more than half of the lanes)
7750 if (++Count > (NumElts / 2)) {
7751 hasDominantValue = true;
7752 Value = V;
7755 if (ValueCounts.size() != 1)
7756 usesOnlyOneValue = false;
7757 if (!Value.getNode() && !ValueCounts.empty())
7758 Value = ValueCounts.begin()->first;
7760 if (ValueCounts.empty())
7761 return DAG.getUNDEF(VT);
// Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR, so
// keep going rather than using SCALAR_TO_VECTOR when the only value is a load.
7765 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7766 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7768 unsigned EltSize = VT.getScalarSizeInBits();
7770 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7771 // i32 and try again.
7772 if (hasDominantValue && EltSize <= 32) {
7773 if (!isConstant) {
7774 SDValue N;
7776 // If we are VDUPing a value that comes directly from a vector, that will
7777 // cause an unnecessary move to and from a GPR, where instead we could
7778 // just use VDUPLANE. We can only do this if the lane being extracted
7779 // is at a constant index, as the VDUP from lane instructions only have
7780 // constant-index forms.
7781 ConstantSDNode *constIndex;
7782 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7783 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7784 // We need to create a new undef vector to use for the VDUPLANE if the
7785 // size of the vector from which we get the value is different than the
7786 // size of the vector that we need to create. We will insert the element
7787 // such that the register coalescer will remove unnecessary copies.
7788 if (VT != Value->getOperand(0).getValueType()) {
7789 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7790 VT.getVectorNumElements();
7791 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7792 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7793 Value, DAG.getConstant(index, dl, MVT::i32)),
7794 DAG.getConstant(index, dl, MVT::i32));
7795 } else
7796 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7797 Value->getOperand(0), Value->getOperand(1));
7798 } else
7799 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7801 if (!usesOnlyOneValue) {
7802 // The dominant value was splatted as 'N', but we now have to insert
7803 // all differing elements.
7804 for (unsigned I = 0; I < NumElts; ++I) {
7805 if (Op.getOperand(I) == Value)
7806 continue;
7807 SmallVector<SDValue, 3> Ops;
7808 Ops.push_back(N);
7809 Ops.push_back(Op.getOperand(I));
7810 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7811 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7814 return N;
7816 if (VT.getVectorElementType().isFloatingPoint()) {
7817 SmallVector<SDValue, 8> Ops;
7818 MVT FVT = VT.getVectorElementType().getSimpleVT();
7819 assert(FVT == MVT::f32 || FVT == MVT::f16);
7820 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7821 for (unsigned i = 0; i < NumElts; ++i)
7822 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7823 Op.getOperand(i)));
7824 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7825 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7826 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7827 if (Val.getNode())
7828 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7830 if (usesOnlyOneValue) {
7831 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7832 if (isConstant && Val.getNode())
7833 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7837 // If all elements are constants and the case above didn't get hit, fall back
7838 // to the default expansion, which will generate a load from the constant
7839 // pool.
7840 if (isConstant)
7841 return SDValue();
7843 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7844 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7845 // length <= 2.
7846 if (NumElts >= 4)
7847 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7848 return shuffle;
7850 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7851 // VCVT's
7852 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7853 return VCVT;
7854 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7855 return VCVT;
7857 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7858 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7859 // into two 64-bit vectors; we might discover a better way to lower it.
7860 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7861 EVT ExtVT = VT.getVectorElementType();
7862 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7863 SDValue Lower =
7864 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
7865 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7866 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7867 SDValue Upper = DAG.getBuildVector(
7868 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
7869 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7870 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7871 if (Lower && Upper)
7872 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7875 // Vectors with 32- or 64-bit elements can be built by directly assigning
7876 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7877 // will be legalized.
7878 if (EltSize >= 32) {
7879 // Do the expansion with floating-point types, since that is what the VFP
7880 // registers are defined to use, and since i64 is not legal.
7881 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7882 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7883 SmallVector<SDValue, 8> Ops;
7884 for (unsigned i = 0; i < NumElts; ++i)
7885 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7886 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7887 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, the default is
// scalar_to_vector for the elements followed by a shuffle (provided the
// shuffle is valid for the target); for everything else it is materialization
// element by element on the stack followed by a load.
7896 if (!isConstant && !usesOnlyOneValue) {
7897 SDValue Vec = DAG.getUNDEF(VT);
7898 for (unsigned i = 0 ; i < NumElts; ++i) {
7899 SDValue V = Op.getOperand(i);
7900 if (V.isUndef())
7901 continue;
7902 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7903 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7905 return Vec;
7908 return SDValue();
7911 // Gather data to see if the operation can be modelled as a
7912 // shuffle in combination with VEXTs.
7913 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7914 SelectionDAG &DAG) const {
7915 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7916 SDLoc dl(Op);
7917 EVT VT = Op.getValueType();
7918 unsigned NumElts = VT.getVectorNumElements();
7920 struct ShuffleSourceInfo {
7921 SDValue Vec;
7922 unsigned MinElt = std::numeric_limits<unsigned>::max();
7923 unsigned MaxElt = 0;
7925 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7926 // be compatible with the shuffle we intend to construct. As a result
7927 // ShuffleVec will be some sliding window into the original Vec.
7928 SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element
// "WindowBase + i * WindowScale" in ShuffleVec.
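// For example, when only the high half of a double-width source is used, the
// code below extracts that half and sets WindowBase to -NumSrcElts; when a
// source is later bitcast to a narrower element type, WindowScale records the
// ratio between the old and the new element sizes.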
7932 int WindowBase = 0;
7933 int WindowScale = 1;
7935 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7937 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7940 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7941 // node.
7942 SmallVector<ShuffleSourceInfo, 2> Sources;
7943 for (unsigned i = 0; i < NumElts; ++i) {
7944 SDValue V = Op.getOperand(i);
7945 if (V.isUndef())
7946 continue;
7947 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7948 // A shuffle can only come from building a vector from various
7949 // elements of other vectors.
7950 return SDValue();
7951 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7952 // Furthermore, shuffles require a constant mask, whereas extractelts
7953 // accept variable indices.
7954 return SDValue();
7957 // Add this element source to the list if it's not already there.
7958 SDValue SourceVec = V.getOperand(0);
7959 auto Source = llvm::find(Sources, SourceVec);
7960 if (Source == Sources.end())
7961 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7963 // Update the minimum and maximum lane number seen.
7964 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
7965 Source->MinElt = std::min(Source->MinElt, EltNo);
7966 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7969 // Currently only do something sane when at most two source vectors
7970 // are involved.
7971 if (Sources.size() > 2)
7972 return SDValue();
// Find the smallest element size among the result and the two sources, and
// use it as the element size to build the shuffle_vector.
7976 EVT SmallestEltTy = VT.getVectorElementType();
7977 for (auto &Source : Sources) {
7978 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7979 if (SrcEltTy.bitsLT(SmallestEltTy))
7980 SmallestEltTy = SrcEltTy;
7982 unsigned ResMultiplier =
7983 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7984 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7985 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7987 // If the source vector is too wide or too narrow, we may nevertheless be able
7988 // to construct a compatible shuffle either by concatenating it with UNDEF or
7989 // extracting a suitable range of elements.
7990 for (auto &Src : Sources) {
7991 EVT SrcVT = Src.ShuffleVec.getValueType();
7993 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
7994 uint64_t VTSize = VT.getFixedSizeInBits();
7995 if (SrcVTSize == VTSize)
7996 continue;
7998 // This stage of the search produces a source with the same element type as
7999 // the original, but with a total width matching the BUILD_VECTOR output.
8000 EVT EltVT = SrcVT.getVectorElementType();
8001 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8002 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8004 if (SrcVTSize < VTSize) {
8005 if (2 * SrcVTSize != VTSize)
8006 return SDValue();
8007 // We can pad out the smaller vector for free, so if it's part of a
8008 // shuffle...
8009 Src.ShuffleVec =
8010 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8011 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8012 continue;
8015 if (SrcVTSize != 2 * VTSize)
8016 return SDValue();
8018 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8019 // Span too large for a VEXT to cope
8020 return SDValue();
8023 if (Src.MinElt >= NumSrcElts) {
8024 // The extraction can just take the second half
8025 Src.ShuffleVec =
8026 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8027 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8028 Src.WindowBase = -NumSrcElts;
8029 } else if (Src.MaxElt < NumSrcElts) {
8030 // The extraction can just take the first half
8031 Src.ShuffleVec =
8032 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8033 DAG.getConstant(0, dl, MVT::i32));
8034 } else {
8035 // An actual VEXT is needed
8036 SDValue VEXTSrc1 =
8037 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8038 DAG.getConstant(0, dl, MVT::i32));
8039 SDValue VEXTSrc2 =
8040 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8041 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8043 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8044 VEXTSrc2,
8045 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8046 Src.WindowBase = -Src.MinElt;
8050 // Another possible incompatibility occurs from the vector element types. We
8051 // can fix this by bitcasting the source vectors to the same type we intend
8052 // for the shuffle.
8053 for (auto &Src : Sources) {
8054 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8055 if (SrcEltTy == SmallestEltTy)
8056 continue;
8057 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8058 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8059 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8060 Src.WindowBase *= Src.WindowScale;
8063 // Final sanity check before we try to actually produce a shuffle.
8064 LLVM_DEBUG(for (auto Src
8065 : Sources)
8066 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8068 // The stars all align, our next step is to produce the mask for the shuffle.
8069 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8070 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8071 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8072 SDValue Entry = Op.getOperand(i);
8073 if (Entry.isUndef())
8074 continue;
8076 auto Src = llvm::find(Sources, Entry.getOperand(0));
8077 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only the low std::min(SrcBits, DestBits) bits actually get defined
// in this segment.
8082 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8083 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8084 VT.getScalarSizeInBits());
8085 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8087 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8088 // starting at the appropriate offset.
8089 int *LaneMask = &Mask[i * ResMultiplier];
8091 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8092 ExtractBase += NumElts * (Src - Sources.begin());
8093 for (int j = 0; j < LanesDefined; ++j)
8094 LaneMask[j] = ExtractBase + j;
8098 // We can't handle more than two sources. This should have already
8099 // been checked before this point.
8100 assert(Sources.size() <= 2 && "Too many sources!");
8102 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8103 for (unsigned i = 0; i < Sources.size(); ++i)
8104 ShuffleOps[i] = Sources[i].ShuffleVec;
8106 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8107 ShuffleOps[1], Mask, DAG);
8108 if (!Shuffle)
8109 return SDValue();
8110 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8113 enum ShuffleOpCodes {
8114 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8115 OP_VREV,
8116 OP_VDUP0,
8117 OP_VDUP1,
8118 OP_VDUP2,
8119 OP_VDUP3,
8120 OP_VEXT1,
8121 OP_VEXT2,
8122 OP_VEXT3,
8123 OP_VUZPL, // VUZP, left result
8124 OP_VUZPR, // VUZP, right result
8125 OP_VZIPL, // VZIP, left result
8126 OP_VZIPR, // VZIP, right result
8127 OP_VTRNL, // VTRN, left result
8128 OP_VTRNR // VTRN, right result
8131 static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8132 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8133 switch (OpNum) {
8134 case OP_COPY:
8135 case OP_VREV:
8136 case OP_VDUP0:
8137 case OP_VDUP1:
8138 case OP_VDUP2:
8139 case OP_VDUP3:
8140 return true;
8142 return false;
8145 /// isShuffleMaskLegal - Targets can use this to indicate that they only
8146 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8147 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8148 /// are assumed to be legal.
8149 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8150 if (VT.getVectorNumElements() == 4 &&
8151 (VT.is128BitVector() || VT.is64BitVector())) {
8152 unsigned PFIndexes[4];
8153 for (unsigned i = 0; i != 4; ++i) {
8154 if (M[i] < 0)
8155 PFIndexes[i] = 8;
8156 else
8157 PFIndexes[i] = M[i];
8160 // Compute the index in the perfect shuffle table.
8161 unsigned PFTableIndex =
8162 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8163 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8164 unsigned Cost = (PFEntry >> 30);
8166 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8167 return true;
8170 bool ReverseVEXT, isV_UNDEF;
8171 unsigned Imm, WhichResult;
8173 unsigned EltSize = VT.getScalarSizeInBits();
8174 if (EltSize >= 32 ||
8175 ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
8176 ShuffleVectorInst::isIdentityMask(M) ||
8177 isVREVMask(M, VT, 64) ||
8178 isVREVMask(M, VT, 32) ||
8179 isVREVMask(M, VT, 16))
8180 return true;
8181 else if (Subtarget->hasNEON() &&
8182 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8183 isVTBLMask(M, VT) ||
8184 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8185 return true;
8186 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
8187 isReverseMask(M, VT))
8188 return true;
8189 else if (Subtarget->hasMVEIntegerOps() &&
8190 (isVMOVNMask(M, VT, true, false) ||
8191 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8192 return true;
8193 else
8194 return false;
8197 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8198 /// the specified operations to build the shuffle.
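/// Each PerfectShuffleTable entry is a 32-bit value: bits [31:30] hold the
/// cost, bits [29:26] the ShuffleOpCodes opcode, and bits [25:13] / [12:0] the
/// LHS / RHS ids, which are themselves indices of table entries generated
/// recursively below.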
8199 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8200 SDValue RHS, SelectionDAG &DAG,
8201 const SDLoc &dl) {
8202 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8203 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8204 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8206 if (OpNum == OP_COPY) {
8207 if (LHSID == (1*9+2)*9+3) return LHS;
8208 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8209 return RHS;
8212 SDValue OpLHS, OpRHS;
8213 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8214 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8215 EVT VT = OpLHS.getValueType();
8217 switch (OpNum) {
8218 default: llvm_unreachable("Unknown shuffle opcode!");
8219 case OP_VREV:
8220 // VREV divides the vector in half and swaps within the half.
8221 if (VT.getVectorElementType() == MVT::i32 ||
8222 VT.getVectorElementType() == MVT::f32)
8223 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8224 // vrev <4 x i16> -> VREV32
8225 if (VT.getVectorElementType() == MVT::i16 ||
8226 VT.getVectorElementType() == MVT::f16)
8227 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8228 // vrev <4 x i8> -> VREV16
8229 assert(VT.getVectorElementType() == MVT::i8);
8230 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8231 case OP_VDUP0:
8232 case OP_VDUP1:
8233 case OP_VDUP2:
8234 case OP_VDUP3:
8235 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8236 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8237 case OP_VEXT1:
8238 case OP_VEXT2:
8239 case OP_VEXT3:
8240 return DAG.getNode(ARMISD::VEXT, dl, VT,
8241 OpLHS, OpRHS,
8242 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8243 case OP_VUZPL:
8244 case OP_VUZPR:
8245 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8246 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8247 case OP_VZIPL:
8248 case OP_VZIPR:
8249 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8250 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8251 case OP_VTRNL:
8252 case OP_VTRNR:
8253 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8254 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8258 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8259 ArrayRef<int> ShuffleMask,
8260 SelectionDAG &DAG) {
8261 // Check to see if we can use the VTBL instruction.
8262 SDValue V1 = Op.getOperand(0);
8263 SDValue V2 = Op.getOperand(1);
8264 SDLoc DL(Op);
8266 SmallVector<SDValue, 8> VTBLMask;
8267 for (ArrayRef<int>::iterator
8268 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
8269 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
8271 if (V2.getNode()->isUndef())
8272 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8273 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8275 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8276 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8279 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
8280 SelectionDAG &DAG) {
8281 SDLoc DL(Op);
8282 SDValue OpLHS = Op.getOperand(0);
8283 EVT VT = OpLHS.getValueType();
8285 assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
8286 "Expect an v8i16/v16i8 type");
8287 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
// For a v16i8 type: after the VREV64, each double word holds its bytes in
// reverse order, i.e. <7, ..., 0, 15, ..., 8>. The VEXT below rotates the
// vector by 8 bytes so the two double words swap, giving the fully reversed
// vector. The v8i16 case is similar.
8291 unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
8292 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
8293 DAG.getConstant(ExtractNum, DL, MVT::i32));
8296 static EVT getVectorTyFromPredicateVector(EVT VT) {
8297 switch (VT.getSimpleVT().SimpleTy) {
8298 case MVT::v4i1:
8299 return MVT::v4i32;
8300 case MVT::v8i1:
8301 return MVT::v8i16;
8302 case MVT::v16i1:
8303 return MVT::v16i8;
8304 default:
8305 llvm_unreachable("Unexpected vector predicate type");
8309 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8310 SelectionDAG &DAG) {
8311 // Converting from boolean predicates to integers involves creating a vector
8312 // of all ones or all zeroes and selecting the lanes based upon the real
8313 // predicate.
8314 SDValue AllOnes =
8315 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8316 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8318 SDValue AllZeroes =
8319 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8320 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8322 // Get full vector type from predicate type
8323 EVT NewVT = getVectorTyFromPredicateVector(VT);
8325 SDValue RecastV1;
// If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
// this to a v16i1. This cannot be done with an ordinary bitcast because the
// sizes are not the same. We have to use an MVE-specific PREDICATE_CAST node,
// since we know in hardware the sizes are really the same.
8330 if (VT != MVT::v16i1)
8331 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8332 else
8333 RecastV1 = Pred;
8335 // Select either all ones or zeroes depending upon the real predicate bits.
8336 SDValue PredAsVector =
8337 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8339 // Recast our new predicate-as-integer v16i8 vector into something
8340 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8341 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8344 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8345 const ARMSubtarget *ST) {
8346 EVT VT = Op.getValueType();
8347 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8348 ArrayRef<int> ShuffleMask = SVN->getMask();
8350 assert(ST->hasMVEIntegerOps() &&
8351 "No support for vector shuffle of boolean predicates");
8353 SDValue V1 = Op.getOperand(0);
8354 SDLoc dl(Op);
8355 if (isReverseMask(ShuffleMask, VT)) {
8356 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8357 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8358 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8359 DAG.getConstant(16, dl, MVT::i32));
8360 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8363 // Until we can come up with optimised cases for every single vector
8364 // shuffle in existence we have chosen the least painful strategy. This is
// to promote the boolean predicate to an 8-bit integer vector, where each
// predicate lane becomes a byte. Then we fall back on a normal integer
8367 // vector shuffle and convert the result back into a predicate vector. In
8368 // many cases the generated code might be even better than scalar code
8369 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8370 // fields in a register into 8 other arbitrary 2-bit fields!
8371 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
8372 EVT NewVT = PredAsVector.getValueType();
8374 // Do the shuffle!
8375 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
8376 DAG.getUNDEF(NewVT), ShuffleMask);
8378 // Now return the result of comparing the shuffled vector with zero,
8379 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8380 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8381 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8384 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8385 ArrayRef<int> ShuffleMask,
8386 SelectionDAG &DAG) {
// Attempt to lower the vector shuffle using as many whole-register movs as
// possible. This is useful for types smaller than 32 bits, which would
// otherwise often become a series of GPR movs.
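// For illustration with v16i8 (QuarterSize == 4): the mask
// <4,5,6,7, 0,1,2,3, u,u,u,u, 12,13,14,15> lowers to whole 32-bit lane moves
// from lanes 1, 0 and 3 of the first input, with the all-undef quarter filled
// in via the fallback shuffle below.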
8390 SDLoc dl(Op);
8391 EVT VT = Op.getValueType();
8392 if (VT.getScalarSizeInBits() >= 32)
8393 return SDValue();
8395 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8396 "Unexpected vector type");
8397 int NumElts = VT.getVectorNumElements();
8398 int QuarterSize = NumElts / 4;
// The four final parts of the vector, extracted as 32-bit (f32) lanes.
8400 SDValue Parts[4];
// Look for full-lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not
// <u,u,u,u>), returning the vmov lane index.
8404 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8405 // Detect which mov lane this would be from the first non-undef element.
8406 int MovIdx = -1;
8407 for (int i = 0; i < Length; i++) {
8408 if (ShuffleMask[Start + i] >= 0) {
8409 if (ShuffleMask[Start + i] % Length != i)
8410 return -1;
8411 MovIdx = ShuffleMask[Start + i] / Length;
8412 break;
8415 // If all items are undef, leave this for other combines
8416 if (MovIdx == -1)
8417 return -1;
8418 // Check the remaining values are the correct part of the same mov
8419 for (int i = 1; i < Length; i++) {
8420 if (ShuffleMask[Start + i] >= 0 &&
8421 (ShuffleMask[Start + i] / Length != MovIdx ||
8422 ShuffleMask[Start + i] % Length != i))
8423 return -1;
8425 return MovIdx;
8428 for (int Part = 0; Part < 4; ++Part) {
8429 // Does this part look like a mov
8430 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8431 if (Elt != -1) {
8432 SDValue Input = Op->getOperand(0);
8433 if (Elt >= 4) {
8434 Input = Op->getOperand(1);
8435 Elt -= 4;
8437 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8438 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8439 DAG.getConstant(Elt, dl, MVT::i32));
8443 // Nothing interesting found, just return
8444 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8445 return SDValue();
// The other parts need to be built with the old shuffle vector, cast to a
// v4f32, and extract_vector_elts.
8449 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8450 SmallVector<int, 16> NewShuffleMask;
8451 for (int Part = 0; Part < 4; ++Part)
8452 for (int i = 0; i < QuarterSize; i++)
8453 NewShuffleMask.push_back(
8454 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8455 SDValue NewShuffle = DAG.getVectorShuffle(
8456 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8457 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8459 for (int Part = 0; Part < 4; ++Part)
8460 if (!Parts[Part])
8461 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8462 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8464 // Build a vector out of the various parts and bitcast it back to the original
8465 // type.
8466 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8467 return DAG.getBitcast(VT, NewVec);
8470 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8471 ArrayRef<int> ShuffleMask,
8472 SelectionDAG &DAG) {
8473 SDValue V1 = Op.getOperand(0);
8474 SDValue V2 = Op.getOperand(1);
8475 EVT VT = Op.getValueType();
8476 unsigned NumElts = VT.getVectorNumElements();
// A One-Off Identity mask is one that is mostly an identity mask from a
// single source but contains a single element out-of-place, either from a
// different vector or from another position in the same vector. Instead of
// lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
// pair directly.
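// For illustration: the v8i16 mask <0,1,2,9,4,5,6,7> is an identity of V1
// except for element 3, which comes from lane 1 of V2, so it lowers to
// INSERT_VECTOR_ELT(V1, EXTRACT_VECTOR_ELT(V2, 1), 3).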
8483 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8484 int &OffElement) {
8485 OffElement = -1;
8486 int NonUndef = 0;
8487 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8488 if (Mask[i] == -1)
8489 continue;
8490 NonUndef++;
8491 if (Mask[i] != i + BaseOffset) {
8492 if (OffElement == -1)
8493 OffElement = i;
8494 else
8495 return false;
8498 return NonUndef > 2 && OffElement != -1;
8500 int OffElement;
8501 SDValue VInput;
8502 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8503 VInput = V1;
8504 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8505 VInput = V2;
8506 else
8507 return SDValue();
8509 SDLoc dl(Op);
8510 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8511 ? MVT::i32
8512 : VT.getScalarType();
8513 SDValue Elt = DAG.getNode(
8514 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8515 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8516 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8517 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8518 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8521 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8522 const ARMSubtarget *ST) {
8523 SDValue V1 = Op.getOperand(0);
8524 SDValue V2 = Op.getOperand(1);
8525 SDLoc dl(Op);
8526 EVT VT = Op.getValueType();
8527 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8528 unsigned EltSize = VT.getScalarSizeInBits();
8530 if (ST->hasMVEIntegerOps() && EltSize == 1)
8531 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8533 // Convert shuffles that are directly supported on NEON to target-specific
8534 // DAG nodes, instead of keeping them as shuffles and matching them again
8535 // during code selection. This is more efficient and avoids the possibility
8536 // of inconsistencies between legalization and selection.
// FIXME: floating-point vectors should be canonicalized to integer vectors
// of the same type so that they get CSEd properly.
8539 ArrayRef<int> ShuffleMask = SVN->getMask();
8541 if (EltSize <= 32) {
8542 if (SVN->isSplat()) {
8543 int Lane = SVN->getSplatIndex();
// If this is an undef splat, generate it via "just" vdup, if possible.
8545 if (Lane == -1) Lane = 0;
8547 // Test if V1 is a SCALAR_TO_VECTOR.
8548 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8549 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8551 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8552 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8553 // reaches it).
8554 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8555 !isa<ConstantSDNode>(V1.getOperand(0))) {
8556 bool IsScalarToVector = true;
8557 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8558 if (!V1.getOperand(i).isUndef()) {
8559 IsScalarToVector = false;
8560 break;
8562 if (IsScalarToVector)
8563 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8565 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8566 DAG.getConstant(Lane, dl, MVT::i32));
8569 bool ReverseVEXT = false;
8570 unsigned Imm = 0;
8571 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8572 if (ReverseVEXT)
8573 std::swap(V1, V2);
8574 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8575 DAG.getConstant(Imm, dl, MVT::i32));
8578 if (isVREVMask(ShuffleMask, VT, 64))
8579 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8580 if (isVREVMask(ShuffleMask, VT, 32))
8581 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8582 if (isVREVMask(ShuffleMask, VT, 16))
8583 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8585 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8586 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8587 DAG.getConstant(Imm, dl, MVT::i32));
8590 // Check for Neon shuffles that modify both input vectors in place.
8591 // If both results are used, i.e., if there are two shuffles with the same
8592 // source operands and with masks corresponding to both results of one of
8593 // these operations, DAG memoization will ensure that a single node is
8594 // used for both shuffles.
8595 unsigned WhichResult = 0;
8596 bool isV_UNDEF = false;
8597 if (ST->hasNEON()) {
8598 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8599 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8600 if (isV_UNDEF)
8601 V2 = V1;
8602 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8603 .getValue(WhichResult);
8606 if (ST->hasMVEIntegerOps()) {
8607 if (isVMOVNMask(ShuffleMask, VT, false, false))
8608 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8609 DAG.getConstant(0, dl, MVT::i32));
8610 if (isVMOVNMask(ShuffleMask, VT, true, false))
8611 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8612 DAG.getConstant(1, dl, MVT::i32));
8613 if (isVMOVNMask(ShuffleMask, VT, true, true))
8614 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8615 DAG.getConstant(1, dl, MVT::i32));
8618 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8619 // shuffles that produce a result larger than their operands with:
8620 // shuffle(concat(v1, undef), concat(v2, undef))
8621 // ->
8622 // shuffle(concat(v1, v2), undef)
8623 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8625 // This is useful in the general case, but there are special cases where
8626 // native shuffles produce larger results: the two-result ops.
8628 // Look through the concat when lowering them:
8629 // shuffle(concat(v1, v2), undef)
8630 // ->
8631 // concat(VZIP(v1, v2):0, :1)
8633 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8634 SDValue SubV1 = V1->getOperand(0);
8635 SDValue SubV2 = V1->getOperand(1);
8636 EVT SubVT = SubV1.getValueType();
8638 // We expect these to have been canonicalized to -1.
8639 assert(llvm::all_of(ShuffleMask, [&](int i) {
8640 return i < (int)VT.getVectorNumElements();
8641 }) && "Unexpected shuffle index into UNDEF operand!");
8643 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8644 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8645 if (isV_UNDEF)
8646 SubV2 = SubV1;
8647 assert((WhichResult == 0) &&
8648 "In-place shuffle of concat can only have one result!");
8649 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8650 SubV1, SubV2);
8651 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8652 Res.getValue(1));
8657 if (ST->hasMVEIntegerOps() && EltSize <= 32)
8658 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8659 return V;
8661 // If the shuffle is not directly supported and it has 4 elements, use
8662 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8663 unsigned NumElts = VT.getVectorNumElements();
8664 if (NumElts == 4) {
8665 unsigned PFIndexes[4];
8666 for (unsigned i = 0; i != 4; ++i) {
8667 if (ShuffleMask[i] < 0)
8668 PFIndexes[i] = 8;
8669 else
8670 PFIndexes[i] = ShuffleMask[i];
8673 // Compute the index in the perfect shuffle table.
8674 unsigned PFTableIndex =
8675 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
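// For example, the mask <0,4,1,5> gives PFIndexes = {0,4,1,5} and a table
// index of 0*729 + 4*81 + 1*9 + 5 = 338.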
8676 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8677 unsigned Cost = (PFEntry >> 30);
8679 if (Cost <= 4) {
8680 if (ST->hasNEON())
8681 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8682 else if (isLegalMVEShuffleOp(PFEntry)) {
8683 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8684 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8685 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8686 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8687 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8688 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8693 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8694 if (EltSize >= 32) {
8695 // Do the expansion with floating-point types, since that is what the VFP
8696 // registers are defined to use, and since i64 is not legal.
8697 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8698 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8699 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8700 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8701 SmallVector<SDValue, 8> Ops;
8702 for (unsigned i = 0; i < NumElts; ++i) {
8703 if (ShuffleMask[i] < 0)
8704 Ops.push_back(DAG.getUNDEF(EltVT));
8705 else
8706 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8707 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8708 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8709 dl, MVT::i32)));
8711 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8712 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8715 if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
8716 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
8718 if (ST->hasNEON() && VT == MVT::v8i8)
8719 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8720 return NewOp;
8722 if (ST->hasMVEIntegerOps())
8723 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8724 return NewOp;
8726 return SDValue();
8729 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8730 const ARMSubtarget *ST) {
8731 EVT VecVT = Op.getOperand(0).getValueType();
8732 SDLoc dl(Op);
8734 assert(ST->hasMVEIntegerOps() &&
8735 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8737 SDValue Conv =
8738 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8739 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
8740 unsigned LaneWidth =
8741 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8742 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8743 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8744 Op.getOperand(1), DAG.getValueType(MVT::i1));
8745 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8746 DAG.getConstant(~Mask, dl, MVT::i32));
8747 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8750 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8751 SelectionDAG &DAG) const {
8752 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8753 SDValue Lane = Op.getOperand(2);
8754 if (!isa<ConstantSDNode>(Lane))
8755 return SDValue();
8757 SDValue Elt = Op.getOperand(1);
8758 EVT EltVT = Elt.getValueType();
8760 if (Subtarget->hasMVEIntegerOps() &&
8761 Op.getValueType().getScalarSizeInBits() == 1)
8762 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8764 if (getTypeAction(*DAG.getContext(), EltVT) ==
8765 TargetLowering::TypePromoteFloat) {
8766 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8767 // but the type system will try to do that if we don't intervene.
8768 // Reinterpret any such vector-element insertion as one with the
8769 // corresponding integer types.
8771 SDLoc dl(Op);
8773 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8774 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8775 TargetLowering::TypePromoteFloat);
8777 SDValue VecIn = Op.getOperand(0);
8778 EVT VecVT = VecIn.getValueType();
8779 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8780 VecVT.getVectorNumElements());
8782 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8783 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8784 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8785 IVecIn, IElt, Lane);
8786 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8789 return Op;
8792 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8793 const ARMSubtarget *ST) {
8794 EVT VecVT = Op.getOperand(0).getValueType();
8795 SDLoc dl(Op);
8797 assert(ST->hasMVEIntegerOps() &&
8798 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8800 SDValue Conv =
8801 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8802 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
8803 unsigned LaneWidth =
8804 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8805 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8806 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8807 return Shift;
8810 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
8811 const ARMSubtarget *ST) {
8812 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8813 SDValue Lane = Op.getOperand(1);
8814 if (!isa<ConstantSDNode>(Lane))
8815 return SDValue();
8817 SDValue Vec = Op.getOperand(0);
8818 EVT VT = Vec.getValueType();
8820 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8821 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8823 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8824 SDLoc dl(Op);
8825 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8828 return Op;
8831 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
8832 const ARMSubtarget *ST) {
8833 SDLoc dl(Op);
8834 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8835 "Unexpected custom CONCAT_VECTORS lowering");
8836 assert(isPowerOf2_32(Op.getNumOperands()) &&
8837 "Unexpected custom CONCAT_VECTORS lowering");
8838 assert(ST->hasMVEIntegerOps() &&
8839 "CONCAT_VECTORS lowering only supported for MVE");
8841 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8842 EVT Op1VT = V1.getValueType();
8843 EVT Op2VT = V2.getValueType();
8844 assert(Op1VT == Op2VT && "Operand types don't match!");
8845 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8847 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8848 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8850 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8851 // promoted to v8i16, etc.
8852 MVT ElType =
8853 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8854 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8856 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8857 // to be the right size for the destination. For example, if Op1 is v4i1
8858 // then the promoted vector is v4i32. The result of concatenation gives a
8859 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8860 // needs truncating to i16 and inserting in the result.
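// For example, for concat(v4i1 a, v4i1 b): a and b are promoted to v4i32,
// their eight lanes are copied one at a time into a v8i16, and the final
// VCMPZ against zero turns that v8i16 back into the v8i1 result.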
8861 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8862 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8863 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8864 EVT NewVT = NewV.getValueType();
8865 EVT ConcatVT = ConVec.getValueType();
8866 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8867 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8868 DAG.getIntPtrConstant(i, dl));
8869 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8870 DAG.getConstant(j, dl, MVT::i32));
8872 return ConVec;
8874 unsigned j = 0;
8875 ConVec = ExtractInto(NewV1, ConVec, j);
8876 ConVec = ExtractInto(NewV2, ConVec, j);
8878 // Now return the result of comparing the subvector with zero,
8879 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8880 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8881 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8884 // Concat each pair of subvectors and pack into the lower half of the array.
8885 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
8886 while (ConcatOps.size() > 1) {
8887 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8888 SDValue V1 = ConcatOps[I];
8889 SDValue V2 = ConcatOps[I + 1];
8890 ConcatOps[I / 2] = ConcatPair(V1, V2);
8892 ConcatOps.resize(ConcatOps.size() / 2);
8894 return ConcatOps[0];
8897 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8898 const ARMSubtarget *ST) {
8899 EVT VT = Op->getValueType(0);
8900 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8901 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8903 // The only time a CONCAT_VECTORS operation can have legal types is when
8904 // two 64-bit vectors are concatenated to a 128-bit vector.
8905 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8906 "unexpected CONCAT_VECTORS");
8907 SDLoc dl(Op);
8908 SDValue Val = DAG.getUNDEF(MVT::v2f64);
8909 SDValue Op0 = Op.getOperand(0);
8910 SDValue Op1 = Op.getOperand(1);
8911 if (!Op0.isUndef())
8912 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8913 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8914 DAG.getIntPtrConstant(0, dl));
8915 if (!Op1.isUndef())
8916 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8917 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8918 DAG.getIntPtrConstant(1, dl));
8919 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8922 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
8923 const ARMSubtarget *ST) {
8924 SDValue V1 = Op.getOperand(0);
8925 SDValue V2 = Op.getOperand(1);
8926 SDLoc dl(Op);
8927 EVT VT = Op.getValueType();
8928 EVT Op1VT = V1.getValueType();
8929 unsigned NumElts = VT.getVectorNumElements();
8930 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
8932 assert(VT.getScalarSizeInBits() == 1 &&
8933 "Unexpected custom EXTRACT_SUBVECTOR lowering");
8934 assert(ST->hasMVEIntegerOps() &&
8935 "EXTRACT_SUBVECTOR lowering only supported for MVE");
8937 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8939 // We now have Op1 promoted to a vector of integers, where v8i1 gets
8940 // promoted to v8i16, etc.
8942 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8944 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
8945 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8946 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
8947 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8948 DAG.getIntPtrConstant(i, dl));
8949 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8950 DAG.getConstant(j, dl, MVT::i32));
8953 // Now return the result of comparing the subvector with zero,
8954 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8955 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
8956 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8959 // Turn a truncate to a predicate (an i1 vector) into icmp(and(x, 1), 0).
8960 static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
8961 const ARMSubtarget *ST) {
8962 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
8963 EVT VT = N->getValueType(0);
8964 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
8965 "Expected a vector i1 type!");
8966 SDValue Op = N->getOperand(0);
8967 EVT FromVT = Op.getValueType();
8968 SDLoc DL(N);
8970 SDValue And =
8971 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
8972 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
8973 DAG.getCondCode(ISD::SETNE));
8976 static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
8977 const ARMSubtarget *Subtarget) {
8978 if (!Subtarget->hasMVEIntegerOps())
8979 return SDValue();
8981 EVT ToVT = N->getValueType(0);
8982 if (ToVT.getScalarType() == MVT::i1)
8983 return LowerTruncatei1(N, DAG, Subtarget);
8985 // MVE does not have a single instruction to perform the truncation of a v4i32
8986 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
8987 // Most of the instructions in MVE follow the 'Beats' system, where moving
8988 // values from different lanes is usually something that the instructions
8989 // avoid.
8991 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
8992 // which take the top/bottom half of a larger lane and extend it (or do the
8993 // opposite, truncating into the top/bottom lane from a larger lane). Note
8994 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
8995 // bottom 16 bits from each vector lane. This works really well with T/B
8996 // instructions, but that doesn't extend to v8i32->v8i16, where the lanes need
8997 // to be reordered.
8999 // But truncates and sext/zext are always going to be fairly common from llvm.
9000 // We have several options for how to deal with them:
9001 // - Wherever possible combine them into an instruction that makes them
9002 // "free". This includes loads/stores, which can perform the trunc as part
9003 // of the memory operation. Or certain shuffles that can be turned into
9004 // VMOVN/VMOVL.
9005 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9006 // trunc(mul(sext(a), sext(b))) may become
9007 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9008 // this case can use VMULL). This is performed in the
9009 // MVELaneInterleavingPass.
9010 // - Otherwise we have an option. By default we would expand the
9011 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9012 // registers, one for each vector lane. This can obviously be
9013 // very expensive.
9014 // - The other option is to use the fact that loads/stores can extend/truncate
9015 // to turn a trunc into two truncating stack stores and a stack reload. This
9016 // becomes 3 back-to-back memory operations, but at least that is less than
9017 // all the insert/extracts.
9019 // In order to do the last, we convert certain truncs into MVETRUNC, which
9020 // are either optimized where they can be, or eventually lowered into stack
9021 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9022 // too early, where other instructions would be better, and stops us from
9023 // having to reconstruct multiple buildvector shuffles into loads/stores.
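// For example, a v8i16 trunc of a v8i32 becomes MVETRUNC(lo, hi) of the two
// v4i32 halves; if no better combine is found it is eventually expanded into
// two truncating stack stores followed by a single stack reload.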
9024 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9025 return SDValue();
9026 EVT FromVT = N->getOperand(0).getValueType();
9027 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9028 return SDValue();
9030 SDValue Lo, Hi;
9031 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9032 SDLoc DL(N);
9033 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9036 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9037 const ARMSubtarget *Subtarget) {
9038 if (!Subtarget->hasMVEIntegerOps())
9039 return SDValue();
9041 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9043 EVT ToVT = N->getValueType(0);
9044 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9045 return SDValue();
9046 SDValue Op = N->getOperand(0);
9047 EVT FromVT = Op.getValueType();
9048 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9049 return SDValue();
9051 SDLoc DL(N);
9052 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9053 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9054 ExtVT = MVT::v8i16;
9056 unsigned Opcode =
9057 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9058 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9059 SDValue Ext1 = Ext.getValue(1);
9061 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9062 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9063 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9066 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9069 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9070 /// element has been zero/sign-extended, depending on the isSigned parameter,
9071 /// from an integer type half its size.
9072 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9073 bool isSigned) {
9074 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9075 EVT VT = N->getValueType(0);
9076 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9077 SDNode *BVN = N->getOperand(0).getNode();
9078 if (BVN->getValueType(0) != MVT::v4i32 ||
9079 BVN->getOpcode() != ISD::BUILD_VECTOR)
9080 return false;
9081 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9082 unsigned HiElt = 1 - LoElt;
9083 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9084 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9085 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9086 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9087 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9088 return false;
9089 if (isSigned) {
9090 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9091 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9092 return true;
9093 } else {
9094 if (Hi0->isNullValue() && Hi1->isNullValue())
9095 return true;
9097 return false;
9100 if (N->getOpcode() != ISD::BUILD_VECTOR)
9101 return false;
9103 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9104 SDNode *Elt = N->getOperand(i).getNode();
9105 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9106 unsigned EltSize = VT.getScalarSizeInBits();
9107 unsigned HalfSize = EltSize / 2;
9108 if (isSigned) {
9109 if (!isIntN(HalfSize, C->getSExtValue()))
9110 return false;
9111 } else {
9112 if (!isUIntN(HalfSize, C->getZExtValue()))
9113 return false;
9115 continue;
9117 return false;
9120 return true;
9123 /// isSignExtended - Check if a node is a vector value that is sign-extended
9124 /// or a constant BUILD_VECTOR with sign-extended elements.
9125 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9126 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9127 return true;
9128 if (isExtendedBUILD_VECTOR(N, DAG, true))
9129 return true;
9130 return false;
9133 /// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9134 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9135 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9136 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9137 ISD::isZEXTLoad(N))
9138 return true;
9139 if (isExtendedBUILD_VECTOR(N, DAG, false))
9140 return true;
9141 return false;
9144 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9145 if (OrigVT.getSizeInBits() >= 64)
9146 return OrigVT;
9148 assert(OrigVT.isSimple() && "Expecting a simple value type");
9150 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9151 switch (OrigSimpleTy) {
9152 default: llvm_unreachable("Unexpected Vector Type");
9153 case MVT::v2i8:
9154 case MVT::v2i16:
9155 return MVT::v2i32;
9156 case MVT::v4i8:
9157 return MVT::v4i16;
9161 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9162 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9163 /// We insert the required extension here to get the vector to fill a D register.
9164 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9165 const EVT &OrigTy,
9166 const EVT &ExtTy,
9167 unsigned ExtOpcode) {
9168 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9169 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9170 // 64-bits we need to insert a new extension so that it will be 64-bits.
9171 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9172 if (OrigTy.getSizeInBits() >= 64)
9173 return N;
9175 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9176 EVT NewVT = getExtensionTo64Bits(OrigTy);
9178 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9181 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
9182 /// does not do any sign/zero extension. If the original vector is less
9183 /// than 64 bits, an appropriate extension will be added after the load to
9184 /// reach a total size of 64 bits. We have to add the extension separately
9185 /// because ARM does not have a sign/zero extending load for vectors.
9186 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
9187 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9189 // The load already has the right type.
9190 if (ExtendedTy == LD->getMemoryVT())
9191 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9192 LD->getBasePtr(), LD->getPointerInfo(),
9193 LD->getAlignment(), LD->getMemOperand()->getFlags());
9195 // We need to create a zextload/sextload. We cannot just create a load
9196 // followed by a zext/sext node because LowerMUL is also run during normal
9197 // operation legalization where we can't create illegal types.
9198 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9199 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9200 LD->getMemoryVT(), LD->getAlignment(),
9201 LD->getMemOperand()->getFlags());
9204 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9205 /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9206 /// the unextended value. The unextended vector should be 64 bits so that it can
9207 /// be used as an operand to a VMULL instruction. If the original vector size
9208 /// before extension is less than 64 bits we add an extension to resize
9209 /// the vector to 64 bits.
9210 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9211 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9212 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9213 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9214 N->getOperand(0)->getValueType(0),
9215 N->getValueType(0),
9216 N->getOpcode());
9218 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9219 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9220 "Expected extending load");
9222 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9223 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9224 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9225 SDValue extLoad =
9226 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9227 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9229 return newLoad;
9232 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9233 // have been legalized as a BITCAST from v4i32.
9234 if (N->getOpcode() == ISD::BITCAST) {
9235 SDNode *BVN = N->getOperand(0).getNode();
9236 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9237 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9238 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9239 return DAG.getBuildVector(
9240 MVT::v2i32, SDLoc(N),
9241 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9243 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9244 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9245 EVT VT = N->getValueType(0);
9246 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9247 unsigned NumElts = VT.getVectorNumElements();
9248 MVT TruncVT = MVT::getIntegerVT(EltSize);
9249 SmallVector<SDValue, 8> Ops;
9250 SDLoc dl(N);
9251 for (unsigned i = 0; i != NumElts; ++i) {
9252 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
9253 const APInt &CInt = C->getAPIntValue();
9254 // Element types smaller than 32 bits are not legal, so use i32 elements.
9255 // The values are implicitly truncated so sext vs. zext doesn't matter.
9256 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9258 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9261 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9262 unsigned Opcode = N->getOpcode();
9263 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9264 SDNode *N0 = N->getOperand(0).getNode();
9265 SDNode *N1 = N->getOperand(1).getNode();
9266 return N0->hasOneUse() && N1->hasOneUse() &&
9267 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9269 return false;
9272 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9273 unsigned Opcode = N->getOpcode();
9274 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9275 SDNode *N0 = N->getOperand(0).getNode();
9276 SDNode *N1 = N->getOperand(1).getNode();
9277 return N0->hasOneUse() && N1->hasOneUse() &&
9278 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9280 return false;
9283 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9284 // Multiplications are only custom-lowered for 128-bit vectors so that
9285 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9286 EVT VT = Op.getValueType();
9287 assert(VT.is128BitVector() && VT.isInteger() &&
9288 "unexpected type for custom-lowering ISD::MUL");
9289 SDNode *N0 = Op.getOperand(0).getNode();
9290 SDNode *N1 = Op.getOperand(1).getNode();
9291 unsigned NewOpc = 0;
9292 bool isMLA = false;
9293 bool isN0SExt = isSignExtended(N0, DAG);
9294 bool isN1SExt = isSignExtended(N1, DAG);
9295 if (isN0SExt && isN1SExt)
9296 NewOpc = ARMISD::VMULLs;
9297 else {
9298 bool isN0ZExt = isZeroExtended(N0, DAG);
9299 bool isN1ZExt = isZeroExtended(N1, DAG);
9300 if (isN0ZExt && isN1ZExt)
9301 NewOpc = ARMISD::VMULLu;
9302 else if (isN1SExt || isN1ZExt) {
9303 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9304 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9305 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9306 NewOpc = ARMISD::VMULLs;
9307 isMLA = true;
9308 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9309 NewOpc = ARMISD::VMULLu;
9310 isMLA = true;
9311 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9312 std::swap(N0, N1);
9313 NewOpc = ARMISD::VMULLu;
9314 isMLA = true;
9318 if (!NewOpc) {
9319 if (VT == MVT::v2i64)
9320 // Fall through to expand this. It is not legal.
9321 return SDValue();
9322 else
9323 // Other vector multiplications are legal.
9324 return Op;
9328 // Legalize to a VMULL instruction.
9329 SDLoc DL(Op);
9330 SDValue Op0;
9331 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9332 if (!isMLA) {
9333 Op0 = SkipExtensionForVMULL(N0, DAG);
9334 assert(Op0.getValueType().is64BitVector() &&
9335 Op1.getValueType().is64BitVector() &&
9336 "unexpected types for extended operands to VMULL");
9337 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9340 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9341 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9342 // vmull q0, d4, d6
9343 // vmlal q0, d5, d6
9344 // is faster than
9345 // vaddl q0, d4, d5
9346 // vmovl q1, d6
9347 // vmul q0, q0, q1
9348 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9349 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9350 EVT Op1VT = Op1.getValueType();
9351 return DAG.getNode(N0->getOpcode(), DL, VT,
9352 DAG.getNode(NewOpc, DL, VT,
9353 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9354 DAG.getNode(NewOpc, DL, VT,
9355 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9358 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9359 SelectionDAG &DAG) {
9360 // TODO: Should this propagate fast-math-flags?
9362 // Convert to float
9363 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9364 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9365 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9366 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9367 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9368 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9369 // Get reciprocal estimate.
9370 // float4 recip = vrecpeq_f32(yf);
9371 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9372 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9374 // Because char has a smaller range than uchar, we can actually get away
9375 // without any Newton steps. This requires that we use a weird bias
9376 // of 0xb000, however (again, this has been exhaustively tested).
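// In other words, the quotient is computed as trunc(xf * recip(yf)); the
// integer bias above adjusts the product so that the conversion back to
// integer below yields the exact quotient over the whole char range.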
9377 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9378 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9379 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9380 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9381 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9382 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9383 // Convert back to short.
9384 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9385 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9386 return X;
9389 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9390 SelectionDAG &DAG) {
9391 // TODO: Should this propagate fast-math-flags?
9393 SDValue N2;
9394 // Convert to float.
9395 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9396 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9397 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9398 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9399 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9400 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9402 // Use reciprocal estimate and one refinement step.
9403 // float4 recip = vrecpeq_f32(yf);
9404 // recip *= vrecpsq_f32(yf, recip);
9405 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9406 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9407 N1);
9408 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9409 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9410 N1, N2);
9411 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9412 // Because short has a smaller range than ushort, we can actually get away
9413 // with only a single Newton step. This requires that we use a weird bias
9414 // of 0x89, however (again, this has been exhaustively tested).
9415 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9416 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9417 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9418 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9419 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9420 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9421 // Convert back to integer and return.
9422 // return vmovn_s32(vcvt_s32_f32(result));
9423 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9424 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9425 return N0;
9428 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9429 const ARMSubtarget *ST) {
9430 EVT VT = Op.getValueType();
9431 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9432 "unexpected type for custom-lowering ISD::SDIV");
9434 SDLoc dl(Op);
9435 SDValue N0 = Op.getOperand(0);
9436 SDValue N1 = Op.getOperand(1);
9437 SDValue N2, N3;
9439 if (VT == MVT::v8i8) {
9440 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9441 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9443 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9444 DAG.getIntPtrConstant(4, dl));
9445 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9446 DAG.getIntPtrConstant(4, dl));
9447 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9448 DAG.getIntPtrConstant(0, dl));
9449 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9450 DAG.getIntPtrConstant(0, dl));
9452 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9453 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9455 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9456 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9458 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9459 return N0;
9461 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9464 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9465 const ARMSubtarget *ST) {
9466 // TODO: Should this propagate fast-math-flags?
9467 EVT VT = Op.getValueType();
9468 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9469 "unexpected type for custom-lowering ISD::UDIV");
9471 SDLoc dl(Op);
9472 SDValue N0 = Op.getOperand(0);
9473 SDValue N1 = Op.getOperand(1);
9474 SDValue N2, N3;
9476 if (VT == MVT::v8i8) {
9477 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9478 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9480 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9481 DAG.getIntPtrConstant(4, dl));
9482 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9483 DAG.getIntPtrConstant(4, dl));
9484 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9485 DAG.getIntPtrConstant(0, dl));
9486 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9487 DAG.getIntPtrConstant(0, dl));
9489 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9490 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9492 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9493 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9495 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9496 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9497 MVT::i32),
9498 N0);
9499 return N0;
9502 // v4i16 udiv ... Convert to float.
9503 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9504 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9505 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9506 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9507 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9508 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9510 // Use reciprocal estimate and two refinement steps.
9511 // float4 recip = vrecpeq_f32(yf);
9512 // recip *= vrecpsq_f32(yf, recip);
9513 // recip *= vrecpsq_f32(yf, recip);
9514 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9515 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9516 BN1);
9517 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9518 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9519 BN1, N2);
9520 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9521 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9522 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9523 BN1, N2);
9524 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9525 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9526 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9527 // and that it will never cause us to return an answer too large).
9528 // float4 result = as_float4(as_int4(xf*recip) + 2);
9529 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9530 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9531 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9532 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9533 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9534 // Convert back to integer and return.
9535 // return vmovn_u32(vcvt_s32_f32(result));
9536 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9537 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9538 return N0;
9541 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
9542 SDNode *N = Op.getNode();
9543 EVT VT = N->getValueType(0);
9544 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9546 SDValue Carry = Op.getOperand(2);
9548 SDLoc DL(Op);
9550 SDValue Result;
9551 if (Op.getOpcode() == ISD::ADDCARRY) {
9552 // This converts the boolean value carry into the carry flag.
9553 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9555 // Do the addition proper using the carry flag we wanted.
9556 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9557 Op.getOperand(1), Carry);
9559 // Now convert the carry flag into a boolean value.
9560 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9561 } else {
9562 // ARMISD::SUBE expects a carry, not a borrow like ISD::SUBCARRY, so we
9563 // have to invert the carry first.
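// Concretely, ARM's SBC subtracts an extra (1 - C), so an incoming borrow B
// becomes the carry C = 1 - B here, and the produced carry is mapped back to
// a borrow with B = 1 - C after the subtraction.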
9564 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9565 DAG.getConstant(1, DL, MVT::i32), Carry);
9566 // This converts the boolean value carry into the carry flag.
9567 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9569 // Do the subtraction proper using the carry flag we wanted.
9570 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9571 Op.getOperand(1), Carry);
9573 // Now convert the carry flag into a boolean value.
9574 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9575 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9576 // by ISD::SUBCARRY, so compute 1 - C.
9577 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9578 DAG.getConstant(1, DL, MVT::i32), Carry);
9581 // Return both values.
9582 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9585 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9586 assert(Subtarget->isTargetDarwin());
9588 // For iOS, we want to call an alternative entry point: __sincos_stret,
9589 // whose return values are passed via sret.
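// Conceptually the callee behaves like the following (illustrative prototype
// only; the f64 variant takes and returns doubles):
//   struct sincos_ret { float sin, cos; };
//   struct sincos_ret __sincos_stret(float x);
// On APCS targets the result is written through an sret pointer instead of
// being returned in registers, which is what the code below sets up.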
9590 SDLoc dl(Op);
9591 SDValue Arg = Op.getOperand(0);
9592 EVT ArgVT = Arg.getValueType();
9593 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9594 auto PtrVT = getPointerTy(DAG.getDataLayout());
9596 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9597 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9599 // Pair of floats / doubles used to pass the result.
9600 Type *RetTy = StructType::get(ArgTy, ArgTy);
9601 auto &DL = DAG.getDataLayout();
9603 ArgListTy Args;
9604 bool ShouldUseSRet = Subtarget->isAPCS_ABI();
9605 SDValue SRet;
9606 if (ShouldUseSRet) {
9607 // Create stack object for sret.
9608 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9609 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9610 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9611 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
9613 ArgListEntry Entry;
9614 Entry.Node = SRet;
9615 Entry.Ty = RetTy->getPointerTo();
9616 Entry.IsSExt = false;
9617 Entry.IsZExt = false;
9618 Entry.IsSRet = true;
9619 Args.push_back(Entry);
9620 RetTy = Type::getVoidTy(*DAG.getContext());
9623 ArgListEntry Entry;
9624 Entry.Node = Arg;
9625 Entry.Ty = ArgTy;
9626 Entry.IsSExt = false;
9627 Entry.IsZExt = false;
9628 Args.push_back(Entry);
9630 RTLIB::Libcall LC =
9631 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9632 const char *LibcallName = getLibcallName(LC);
9633 CallingConv::ID CC = getLibcallCallingConv(LC);
9634 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9636 TargetLowering::CallLoweringInfo CLI(DAG);
9637 CLI.setDebugLoc(dl)
9638 .setChain(DAG.getEntryNode())
9639 .setCallee(CC, RetTy, Callee, std::move(Args))
9640 .setDiscardResult(ShouldUseSRet);
9641 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9643 if (!ShouldUseSRet)
9644 return CallResult.first;
9646 SDValue LoadSin =
9647 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9649 // Address of cos field.
9650 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9651 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9652 SDValue LoadCos =
9653 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9655 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9656 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9657 LoadSin.getValue(0), LoadCos.getValue(0));
9660 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9661 bool Signed,
9662 SDValue &Chain) const {
9663 EVT VT = Op.getValueType();
9664 assert((VT == MVT::i32 || VT == MVT::i64) &&
9665 "unexpected type for custom lowering DIV");
9666 SDLoc dl(Op);
9668 const auto &DL = DAG.getDataLayout();
9669 const auto &TLI = DAG.getTargetLoweringInfo();
9671 const char *Name = nullptr;
9672 if (Signed)
9673 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
9674 else
9675 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
9677 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
9679 ARMTargetLowering::ArgListTy Args;
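// The __rt_*div helpers expect the divisor as the first argument, hence the
// reversed {1, 0} operand order below.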
9681 for (auto AI : {1, 0}) {
9682 ArgListEntry Arg;
9683 Arg.Node = Op.getOperand(AI);
9684 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
9685 Args.push_back(Arg);
9688 CallLoweringInfo CLI(DAG);
9689 CLI.setDebugLoc(dl)
9690 .setChain(Chain)
9691 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
9692 ES, std::move(Args));
9694 return LowerCallTo(CLI).first;
9697 // This is a code size optimisation: return the original SDIV node to
9698 // DAGCombiner when we don't want to expand SDIV into a sequence of
9699 // instructions, and an empty node otherwise, which will cause the
9700 // SDIV to be expanded in DAGCombine.
9701 SDValue
9702 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9703 SelectionDAG &DAG,
9704 SmallVectorImpl<SDNode *> &Created) const {
9705 // TODO: Support SREM
9706 if (N->getOpcode() != ISD::SDIV)
9707 return SDValue();
9709 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
9710 const bool MinSize = ST.hasMinSize();
9711 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9712 : ST.hasDivideInARMMode();
9714 // Don't touch vector types; rewriting this may lead to scalarizing
9715 // the int divs.
9716 if (N->getOperand(0).getValueType().isVector())
9717 return SDValue();
9719 // Bail if MinSize is not set; for both ARM and Thumb mode we also need
9720 // hwdiv support for this to be really profitable.
9721 if (!(MinSize && HasDivide))
9722 return SDValue();
9724 // ARM mode is a bit simpler than Thumb: we can handle large power
9725 // of 2 immediates with 1 mov instruction; no further checks required,
9726 // just return the sdiv node.
9727 if (!ST.isThumb())
9728 return SDValue(N, 0);
9730 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9731 // and thus lose the code size benefits of a MOVS that requires only 2 bytes.
9732 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9733 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9734 if (Divisor.sgt(128))
9735 return SDValue();
9737 return SDValue(N, 0);
9740 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9741 bool Signed) const {
9742 assert(Op.getValueType() == MVT::i32 &&
9743 "unexpected type for custom lowering DIV");
9744 SDLoc dl(Op);
9746 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9747 DAG.getEntryNode(), Op.getOperand(1));
9749 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9752 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9753 SDLoc DL(N);
9754 SDValue Op = N->getOperand(1);
9755 if (N->getValueType(0) == MVT::i32)
9756 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
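// For i64 the denominator is zero exactly when both 32-bit halves are zero,
// so feed the divide-by-zero check with (Lo | Hi).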
9757 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
9758 DAG.getConstant(0, DL, MVT::i32));
9759 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
9760 DAG.getConstant(1, DL, MVT::i32));
9761 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9762 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9765 void ARMTargetLowering::ExpandDIV_Windows(
9766 SDValue Op, SelectionDAG &DAG, bool Signed,
9767 SmallVectorImpl<SDValue> &Results) const {
9768 const auto &DL = DAG.getDataLayout();
9769 const auto &TLI = DAG.getTargetLoweringInfo();
9771 assert(Op.getValueType() == MVT::i64 &&
9772 "unexpected type for custom lowering DIV");
9773 SDLoc dl(Op);
9775 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9777 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9779 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9780 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9781 DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
9782 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9784 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9787 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
9788 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9789 EVT MemVT = LD->getMemoryVT();
9790 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
9791 "Expected a predicate type!");
9792 assert(MemVT == Op.getValueType());
9793 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9794 "Expected a non-extending load");
9795 assert(LD->isUnindexed() && "Expected an unindexed load");
9797 // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16-bit
9798 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9799 // need to make sure that the 8/4 bits are actually loaded into the correct
9800 // place, which means loading the value and then shuffling the values into
9801 // the bottom bits of the predicate.
9802 // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect
9803 // for BE).
9804 // Speaking of BE, apparently the rest of llvm will assume a reverse order to
9805 // a natural VMSR(load), so the value needs to be reversed.
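// For example, a v4i1 predicate covers 32-bit vector lanes, so each logical
// lane corresponds to 4 of the 16 bits in VPR.P0 (one per byte lane). The
// scalar extload plus PREDICATE_CAST below recreates the predicate from the
// stored bits, extracting the low subvector for the narrower predicate types.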
9807 SDLoc dl(Op);
9808 SDValue Load = DAG.getExtLoad(
9809 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9810 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9811 LD->getMemOperand());
9812 SDValue Val = Load;
9813 if (DAG.getDataLayout().isBigEndian())
9814 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
9815 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
9816 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
9817 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
9818 if (MemVT != MVT::v16i1)
9819 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9820 DAG.getConstant(0, dl, MVT::i32));
9821 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9824 void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9825 SelectionDAG &DAG) const {
9826 LoadSDNode *LD = cast<LoadSDNode>(N);
9827 EVT MemVT = LD->getMemoryVT();
9828 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9830 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9831 !Subtarget->isThumb1Only() && LD->isVolatile()) {
9832 SDLoc dl(N);
9833 SDValue Result = DAG.getMemIntrinsicNode(
9834 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9835 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
9836 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9837 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9838 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
9839 Results.append({Pair, Result.getValue(2)});
9843 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
9844 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9845 EVT MemVT = ST->getMemoryVT();
9846 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
9847 "Expected a predicate type!");
9848 assert(MemVT == ST->getValue().getValueType());
9849 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9850 assert(ST->isUnindexed() && "Expected an unindexed store");
9852 // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
9853 // unset and a scalar store.
9854 SDLoc dl(Op);
9855 SDValue Build = ST->getValue();
9856 if (MemVT != MVT::v16i1) {
9857 SmallVector<SDValue, 16> Ops;
9858 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
9859 unsigned Elt = DAG.getDataLayout().isBigEndian()
9860 ? MemVT.getVectorNumElements() - I - 1
9861 : I;
9862 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9863 DAG.getConstant(Elt, dl, MVT::i32)));
9865 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9866 Ops.push_back(DAG.getUNDEF(MVT::i32));
9867 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9869 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
9870 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
9871 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
9872 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
9873 DAG.getConstant(16, dl, MVT::i32));
9874 return DAG.getTruncStore(
9875 ST->getChain(), dl, GRP, ST->getBasePtr(),
9876 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9877 ST->getMemOperand());
9880 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
9881 const ARMSubtarget *Subtarget) {
9882 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9883 EVT MemVT = ST->getMemoryVT();
9884 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9886 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9887 !Subtarget->isThumb1Only() && ST->isVolatile()) {
9888 SDNode *N = Op.getNode();
9889 SDLoc dl(N);
9891 SDValue Lo = DAG.getNode(
9892 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9893 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9894 MVT::i32));
9895 SDValue Hi = DAG.getNode(
9896 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9897 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9898 MVT::i32));
9900 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9901 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9902 MemVT, ST->getMemOperand());
9903 } else if (Subtarget->hasMVEIntegerOps() &&
9904 ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9905 MemVT == MVT::v16i1))) {
9906 return LowerPredicateStore(Op, DAG);
9909 return SDValue();
9912 static bool isZeroVector(SDValue N) {
9913 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9914 (N->getOpcode() == ARMISD::VMOVIMM &&
9915 isNullConstant(N->getOperand(0))));
9918 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
9919 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
9920 MVT VT = Op.getSimpleValueType();
9921 SDValue Mask = N->getMask();
9922 SDValue PassThru = N->getPassThru();
9923 SDLoc dl(Op);
9925 if (isZeroVector(PassThru))
9926 return Op;
9928 // MVE masked loads use zero as the passthru value. Here we convert undef to
9929 // zero too, and other values are lowered to a select.
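// That is, MLOAD(ptr, mask, passthru) is lowered here as
// select(mask, MLOAD(ptr, mask, zerovec), passthru), unless the passthru is
// already zero (or undef, which is treated as zero).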
9930 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
9931 DAG.getTargetConstant(0, dl, MVT::i32));
9932 SDValue NewLoad = DAG.getMaskedLoad(
9933 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
9934 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
9935 N->getExtensionType(), N->isExpandingLoad());
9936 SDValue Combo = NewLoad;
9937 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
9938 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
9939 isZeroVector(PassThru->getOperand(0));
9940 if (!PassThru.isUndef() && !PassThruIsCastZero)
9941 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
9942 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
9945 static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
9946 const ARMSubtarget *ST) {
9947 if (!ST->hasMVEIntegerOps())
9948 return SDValue();
9950 SDLoc dl(Op);
9951 unsigned BaseOpcode = 0;
9952 switch (Op->getOpcode()) {
9953 default: llvm_unreachable("Expected VECREDUCE opcode");
9954 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
9955 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
9956 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
9957 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
9958 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
9959 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
9960 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
9961 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
9964 SDValue Op0 = Op->getOperand(0);
9965 EVT VT = Op0.getValueType();
9966 EVT EltVT = VT.getVectorElementType();
9967 unsigned NumElts = VT.getVectorNumElements();
9968 unsigned NumActiveLanes = NumElts;
9970 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
9971 NumActiveLanes == 2) &&
9972 "Only expected a power 2 vector size");
9974 // Use BaseOpcode(X, Rev(X)) until 4 items remain. Going down to 4 vector
9975 // elements allows us to easily extract vector elements from the lanes.
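// For example, when reducing a v8i16, BaseOpcode(Op0, VREV32(Op0)) combines
// each adjacent pair of lanes, leaving four useful results in lanes 0, 2, 4
// and 6, which are then extracted and combined as scalars below.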
9976 while (NumActiveLanes > 4) {
9977 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
9978 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
9979 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
9980 NumActiveLanes /= 2;
9983 SDValue Res;
9984 if (NumActiveLanes == 4) {
9985 // The remaining 4 elements are combined sequentially.
9986 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9987 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
9988 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9989 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
9990 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9991 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
9992 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9993 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
9994 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
9995 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
9996 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
9997 } else {
9998 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9999 DAG.getConstant(0, dl, MVT::i32));
10000 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10001 DAG.getConstant(1, dl, MVT::i32));
10002 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10005 // Result type may be wider than element type.
10006 if (EltVT != Op->getValueType(0))
10007 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10008 return Res;
10011 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10012 const ARMSubtarget *ST) {
10013 if (!ST->hasMVEFloatOps())
10014 return SDValue();
10015 return LowerVecReduce(Op, DAG, ST);
10018 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10019 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10020 // Acquire/Release load/store is not legal for targets without a dmb or
10021 // equivalent available.
10022 return SDValue();
10024 // Monotonic load/store is legal for all targets.
10025 return Op;
10028 static void ReplaceREADCYCLECOUNTER(SDNode *N,
10029 SmallVectorImpl<SDValue> &Results,
10030 SelectionDAG &DAG,
10031 const ARMSubtarget *Subtarget) {
10032 SDLoc DL(N);
10033 // Under Power Management extensions, the cycle-count is:
10034 // mrc p15, #0, <Rt>, c9, c13, #0
10035 SDValue Ops[] = { N->getOperand(0), // Chain
10036 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10037 DAG.getTargetConstant(15, DL, MVT::i32),
10038 DAG.getTargetConstant(0, DL, MVT::i32),
10039 DAG.getTargetConstant(9, DL, MVT::i32),
10040 DAG.getTargetConstant(13, DL, MVT::i32),
10041 DAG.getTargetConstant(0, DL, MVT::i32)
10044 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10045 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10046 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10047 DAG.getConstant(0, DL, MVT::i32)));
10048 Results.push_back(Cycles32.getValue(1));
10051 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
10052 SDLoc dl(V.getNode());
10053 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
10054 SDValue VHi = DAG.getAnyExtOrTrunc(
10055 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
10056 dl, MVT::i32);
10057 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10058 if (isBigEndian)
10059 std::swap (VLo, VHi);
10060 SDValue RegClass =
10061 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10062 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10063 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10064 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
10065 return SDValue(
10066 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10067 }
10069 static void ReplaceCMP_SWAP_64Results(SDNode *N,
10070 SmallVectorImpl<SDValue> &Results,
10071 SelectionDAG &DAG) {
10072 assert(N->getValueType(0) == MVT::i64 &&
10073 "AtomicCmpSwap on types less than 64 should be legal");
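// Operand layout of the ATOMIC_CMP_SWAP node being replaced: operand 0 is the
// chain, operand 1 the pointer, operand 2 the expected (comparison) value and
// operand 3 the new value. The two i64 data operands are packed into GPRPairs
// for the CMP_SWAP_64 pseudo below.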
10074 SDValue Ops[] = {N->getOperand(1),
10075 createGPRPairNode(DAG, N->getOperand(2)),
10076 createGPRPairNode(DAG, N->getOperand(3)),
10077 N->getOperand(0)};
10078 SDNode *CmpSwap = DAG.getMachineNode(
10079 ARM::CMP_SWAP_64, SDLoc(N),
10080 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
10082 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10083 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10085 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10087 SDValue Lo =
10088 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10089 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10090 SDValue Hi =
10091 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10092 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10093 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10094 Results.push_back(SDValue(CmpSwap, 2));
10095 }
10097 SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10098 SDLoc dl(Op);
10099 EVT VT = Op.getValueType();
10100 SDValue Chain = Op.getOperand(0);
10101 SDValue LHS = Op.getOperand(1);
10102 SDValue RHS = Op.getOperand(2);
10103 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10104 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10106 // If we don't have instructions of this float type then soften to a libcall
10107 // and use SETCC instead.
10108 if (isUnsupportedFloatingType(LHS.getValueType())) {
10109 DAG.getTargetLoweringInfo().softenSetCCOperands(
10110 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
10111 if (!RHS.getNode()) {
10112 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10113 CC = ISD::SETNE;
10114 }
10115 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10116 DAG.getCondCode(CC));
10117 return DAG.getMergeValues({Result, Chain}, dl);
10118 }
10120 ARMCC::CondCodes CondCode, CondCode2;
10121 FPCCToARMCC(CC, CondCode, CondCode2);
10123 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
10124 // in CMPFP and CMPFPE, but instead it should be made explicit by these
10125 // instructions using a chain instead of glue. This would also fix the problem
10126 // here (and also in LowerSELECT_CC) where we generate two comparisons when
10127 // CondCode2 != AL.
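// As an illustration: an unordered-equal compare typically maps to an EQ test
// plus a second, unordered (VS) test, so the code below emits one CMOV for
// CondCode and, when CondCode2 != AL, a second CMOV folding in the extra
// condition.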
10128 SDValue True = DAG.getConstant(1, dl, VT);
10129 SDValue False = DAG.getConstant(0, dl, VT);
10130 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10131 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
10132 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10133 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
10134 if (CondCode2 != ARMCC::AL) {
10135 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10136 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10137 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
10138 }
10139 return DAG.getMergeValues({Result, Chain}, dl);
10140 }
10142 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10143 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10144 switch (Op.getOpcode()) {
10145 default: llvm_unreachable("Don't know how to custom lower this!");
10146 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10147 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10148 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10149 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10150 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10151 case ISD::SELECT: return LowerSELECT(Op, DAG);
10152 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10153 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10154 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10155 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10156 case ISD::VASTART: return LowerVASTART(Op, DAG);
10157 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10158 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10159 case ISD::SINT_TO_FP:
10160 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10161 case ISD::STRICT_FP_TO_SINT:
10162 case ISD::STRICT_FP_TO_UINT:
10163 case ISD::FP_TO_SINT:
10164 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10165 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10166 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10167 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10168 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10169 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10170 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10171 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10172 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10173 Subtarget);
10174 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10175 case ISD::SHL:
10176 case ISD::SRL:
10177 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10178 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10179 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10180 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10181 case ISD::SRL_PARTS:
10182 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10183 case ISD::CTTZ:
10184 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10185 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10186 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10187 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10188 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10189 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10190 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10191 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10192 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10193 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10194 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10195 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10196 case ISD::SIGN_EXTEND:
10197 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10198 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
10199 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10200 case ISD::MUL: return LowerMUL(Op, DAG);
10201 case ISD::SDIV:
10202 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10203 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10204 return LowerSDIV(Op, DAG, Subtarget);
10205 case ISD::UDIV:
10206 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10207 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10208 return LowerUDIV(Op, DAG, Subtarget);
10209 case ISD::ADDCARRY:
10210 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
10211 case ISD::SADDO:
10212 case ISD::SSUBO:
10213 return LowerSignedALUO(Op, DAG);
10214 case ISD::UADDO:
10215 case ISD::USUBO:
10216 return LowerUnsignedALUO(Op, DAG);
10217 case ISD::SADDSAT:
10218 case ISD::SSUBSAT:
10219 case ISD::UADDSAT:
10220 case ISD::USUBSAT:
10221 return LowerADDSUBSAT(Op, DAG, Subtarget);
10222 case ISD::LOAD:
10223 return LowerPredicateLoad(Op, DAG);
10224 case ISD::STORE:
10225 return LowerSTORE(Op, DAG, Subtarget);
10226 case ISD::MLOAD:
10227 return LowerMLOAD(Op, DAG);
10228 case ISD::VECREDUCE_MUL:
10229 case ISD::VECREDUCE_AND:
10230 case ISD::VECREDUCE_OR:
10231 case ISD::VECREDUCE_XOR:
10232 return LowerVecReduce(Op, DAG, Subtarget);
10233 case ISD::VECREDUCE_FADD:
10234 case ISD::VECREDUCE_FMUL:
10235 case ISD::VECREDUCE_FMIN:
10236 case ISD::VECREDUCE_FMAX:
10237 return LowerVecReduceF(Op, DAG, Subtarget);
10238 case ISD::ATOMIC_LOAD:
10239 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10240 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10241 case ISD::SDIVREM:
10242 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10243 case ISD::DYNAMIC_STACKALLOC:
10244 if (Subtarget->isTargetWindows())
10245 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10246 llvm_unreachable("Don't know how to custom lower this!");
10247 case ISD::STRICT_FP_ROUND:
10248 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10249 case ISD::STRICT_FP_EXTEND:
10250 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10251 case ISD::STRICT_FSETCC:
10252 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10253 case ARMISD::WIN__DBZCHK: return SDValue();
10254 }
10255 }
10257 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10258 SelectionDAG &DAG) {
10259 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
10260 unsigned Opc = 0;
10261 if (IntNo == Intrinsic::arm_smlald)
10262 Opc = ARMISD::SMLALD;
10263 else if (IntNo == Intrinsic::arm_smlaldx)
10264 Opc = ARMISD::SMLALDX;
10265 else if (IntNo == Intrinsic::arm_smlsld)
10266 Opc = ARMISD::SMLSLD;
10267 else if (IntNo == Intrinsic::arm_smlsldx)
10268 Opc = ARMISD::SMLSLDX;
10269 else
10270 return;
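// The i64 accumulator (operand 3 of the intrinsic) is split into its low and
// high i32 halves below, since the SMLALD/SMLSLD family consumes and produces
// the 64-bit accumulator as a pair of i32 values; the two results are then
// re-packed with BUILD_PAIR.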
10272 SDLoc dl(N);
10273 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
10274 N->getOperand(3),
10275 DAG.getConstant(0, dl, MVT::i32));
10276 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
10277 N->getOperand(3),
10278 DAG.getConstant(1, dl, MVT::i32));
10280 SDValue LongMul = DAG.getNode(Opc, dl,
10281 DAG.getVTList(MVT::i32, MVT::i32),
10282 N->getOperand(1), N->getOperand(2),
10283 Lo, Hi);
10284 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10285 LongMul.getValue(0), LongMul.getValue(1)));
10286 }
10288 /// ReplaceNodeResults - Replace the results of a node with an illegal result
10289 /// type with new values built out of custom code.
10290 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10291 SmallVectorImpl<SDValue> &Results,
10292 SelectionDAG &DAG) const {
10293 SDValue Res;
10294 switch (N->getOpcode()) {
10295 default:
10296 llvm_unreachable("Don't know how to custom expand this!");
10297 case ISD::READ_REGISTER:
10298 ExpandREAD_REGISTER(N, Results, DAG);
10299 break;
10300 case ISD::BITCAST:
10301 Res = ExpandBITCAST(N, DAG, Subtarget);
10302 break;
10303 case ISD::SRL:
10304 case ISD::SRA:
10305 case ISD::SHL:
10306 Res = Expand64BitShift(N, DAG, Subtarget);
10307 break;
10308 case ISD::SREM:
10309 case ISD::UREM:
10310 Res = LowerREM(N, DAG);
10311 break;
10312 case ISD::SDIVREM:
10313 case ISD::UDIVREM:
10314 Res = LowerDivRem(SDValue(N, 0), DAG);
10315 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10316 Results.push_back(Res.getValue(0));
10317 Results.push_back(Res.getValue(1));
10318 return;
10319 case ISD::SADDSAT:
10320 case ISD::SSUBSAT:
10321 case ISD::UADDSAT:
10322 case ISD::USUBSAT:
10323 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10324 break;
10325 case ISD::READCYCLECOUNTER:
10326 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10327 return;
10328 case ISD::UDIV:
10329 case ISD::SDIV:
10330 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10331 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10332 Results);
10333 case ISD::ATOMIC_CMP_SWAP:
10334 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10335 return;
10336 case ISD::INTRINSIC_WO_CHAIN:
10337 return ReplaceLongIntrinsic(N, Results, DAG);
10338 case ISD::ABS:
10339 lowerABS(N, Results, DAG);
10340 return;
10341 case ISD::LOAD:
10342 LowerLOAD(N, Results, DAG);
10343 break;
10344 case ISD::TRUNCATE:
10345 Res = LowerTruncate(N, DAG, Subtarget);
10346 break;
10347 case ISD::SIGN_EXTEND:
10348 case ISD::ZERO_EXTEND:
10349 Res = LowerVectorExtend(N, DAG, Subtarget);
10350 break;
10351 }
10352 if (Res.getNode())
10353 Results.push_back(Res);
10354 }
10356 //===----------------------------------------------------------------------===//
10357 // ARM Scheduler Hooks
10358 //===----------------------------------------------------------------------===//
10360 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10361 /// registers the function context.
10362 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10363 MachineBasicBlock *MBB,
10364 MachineBasicBlock *DispatchBB,
10365 int FI) const {
10366 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10367 "ROPI/RWPI not currently supported with SjLj");
10368 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10369 DebugLoc dl = MI.getDebugLoc();
10370 MachineFunction *MF = MBB->getParent();
10371 MachineRegisterInfo *MRI = &MF->getRegInfo();
10372 MachineConstantPool *MCP = MF->getConstantPool();
10373 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10374 const Function &F = MF->getFunction();
10376 bool isThumb = Subtarget->isThumb();
10377 bool isThumb2 = Subtarget->isThumb2();
10379 unsigned PCLabelId = AFI->createPICLabelUId();
10380 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
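// The PC adjustment accounts for the implicit offset of a PC-relative read:
// the PC reads as the instruction address plus 4 in Thumb mode and plus 8 in
// ARM mode.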
10381 ARMConstantPoolValue *CPV =
10382 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10383 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10385 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10386 : &ARM::GPRRegClass;
10388 // Grab constant pool and fixed stack memory operands.
10389 MachineMemOperand *CPMMO =
10390 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10391 MachineMemOperand::MOLoad, 4, Align(4));
10393 MachineMemOperand *FIMMOSt =
10394 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
10395 MachineMemOperand::MOStore, 4, Align(4));
10397 // Load the address of the dispatch MBB into the jump buffer.
10398 if (isThumb2) {
10399 // Incoming value: jbuf
10400 // ldr.n r5, LCPI1_1
10401 // orr r5, r5, #1
10402 // add r5, pc
10403 // str r5, [$jbuf, #+4] ; &jbuf[1]
10404 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10405 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10406 .addConstantPoolIndex(CPI)
10407 .addMemOperand(CPMMO)
10408 .add(predOps(ARMCC::AL));
10409 // Set the low bit because of thumb mode.
10410 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10411 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10412 .addReg(NewVReg1, RegState::Kill)
10413 .addImm(0x01)
10414 .add(predOps(ARMCC::AL))
10415 .add(condCodeOp());
10416 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10417 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10418 .addReg(NewVReg2, RegState::Kill)
10419 .addImm(PCLabelId);
10420 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10421 .addReg(NewVReg3, RegState::Kill)
10422 .addFrameIndex(FI)
10423 .addImm(36) // &jbuf[1] :: pc
10424 .addMemOperand(FIMMOSt)
10425 .add(predOps(ARMCC::AL));
10426 } else if (isThumb) {
10427 // Incoming value: jbuf
10428 // ldr.n r1, LCPI1_4
10429 // add r1, pc
10430 // mov r2, #1
10431 // orrs r1, r2
10432 // add r2, $jbuf, #+4 ; &jbuf[1]
10433 // str r1, [r2]
10434 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10435 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10436 .addConstantPoolIndex(CPI)
10437 .addMemOperand(CPMMO)
10438 .add(predOps(ARMCC::AL));
10439 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10440 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10441 .addReg(NewVReg1, RegState::Kill)
10442 .addImm(PCLabelId);
10443 // Set the low bit because of thumb mode.
10444 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10445 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10446 .addReg(ARM::CPSR, RegState::Define)
10447 .addImm(1)
10448 .add(predOps(ARMCC::AL));
10449 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10450 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10451 .addReg(ARM::CPSR, RegState::Define)
10452 .addReg(NewVReg2, RegState::Kill)
10453 .addReg(NewVReg3, RegState::Kill)
10454 .add(predOps(ARMCC::AL));
10455 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10456 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10457 .addFrameIndex(FI)
10458 .addImm(36); // &jbuf[1] :: pc
10459 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10460 .addReg(NewVReg4, RegState::Kill)
10461 .addReg(NewVReg5, RegState::Kill)
10462 .addImm(0)
10463 .addMemOperand(FIMMOSt)
10464 .add(predOps(ARMCC::AL));
10465 } else {
10466 // Incoming value: jbuf
10467 // ldr r1, LCPI1_1
10468 // add r1, pc, r1
10469 // str r1, [$jbuf, #+4] ; &jbuf[1]
10470 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10471 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10472 .addConstantPoolIndex(CPI)
10473 .addImm(0)
10474 .addMemOperand(CPMMO)
10475 .add(predOps(ARMCC::AL));
10476 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10477 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10478 .addReg(NewVReg1, RegState::Kill)
10479 .addImm(PCLabelId)
10480 .add(predOps(ARMCC::AL));
10481 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10482 .addReg(NewVReg2, RegState::Kill)
10483 .addFrameIndex(FI)
10484 .addImm(36) // &jbuf[1] :: pc
10485 .addMemOperand(FIMMOSt)
10486 .add(predOps(ARMCC::AL));
10487 }
10488 }
10490 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10491 MachineBasicBlock *MBB) const {
10492 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10493 DebugLoc dl = MI.getDebugLoc();
10494 MachineFunction *MF = MBB->getParent();
10495 MachineRegisterInfo *MRI = &MF->getRegInfo();
10496 MachineFrameInfo &MFI = MF->getFrameInfo();
10497 int FI = MFI.getFunctionContextIndex();
10499 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10500 : &ARM::GPRnopcRegClass;
10502 // Get a mapping of the call site numbers to all of the landing pads they're
10503 // associated with.
10504 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10505 unsigned MaxCSNum = 0;
10506 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
10507 ++BB) {
10508 if (!BB->isEHPad()) continue;
10510 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10511 // pad.
10512 for (MachineBasicBlock::iterator
10513 II = BB->begin(), IE = BB->end(); II != IE; ++II) {
10514 if (!II->isEHLabel()) continue;
10516 MCSymbol *Sym = II->getOperand(0).getMCSymbol();
10517 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10519 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10520 for (SmallVectorImpl<unsigned>::iterator
10521 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
10522 CSI != CSE; ++CSI) {
10523 CallSiteNumToLPad[*CSI].push_back(&*BB);
10524 MaxCSNum = std::max(MaxCSNum, *CSI);
10525 }
10526 break;
10527 }
10528 }
10530 // Get an ordered list of the machine basic blocks for the jump table.
10531 std::vector<MachineBasicBlock*> LPadList;
10532 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10533 LPadList.reserve(CallSiteNumToLPad.size());
10534 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10535 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10536 for (SmallVectorImpl<MachineBasicBlock*>::iterator
10537 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
10538 LPadList.push_back(*II);
10539 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
10540 }
10541 }
10543 assert(!LPadList.empty() &&
10544 "No landing pad destinations for the dispatch jump table!");
10546 // Create the jump table and associated information.
10547 MachineJumpTableInfo *JTI =
10548 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10549 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10551 // Create the MBBs for the dispatch code.
10553 // Shove the dispatch's address into the return slot in the function context.
10554 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10555 DispatchBB->setIsEHPad();
10557 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10558 unsigned trap_opcode;
10559 if (Subtarget->isThumb())
10560 trap_opcode = ARM::tTRAP;
10561 else
10562 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
10564 BuildMI(TrapBB, dl, TII->get(trap_opcode));
10565 DispatchBB->addSuccessor(TrapBB);
10567 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10568 DispatchBB->addSuccessor(DispContBB);
10570 // Insert the new MBBs into the function.
10571 MF->insert(MF->end(), DispatchBB);
10572 MF->insert(MF->end(), DispContBB);
10573 MF->insert(MF->end(), TrapBB);
10575 // Insert code into the entry block that creates and registers the function
10576 // context.
10577 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10579 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10580 MachinePointerInfo::getFixedStack(*MF, FI),
10581 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
10583 MachineInstrBuilder MIB;
10584 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10586 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10587 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10589 // Add a register mask with no preserved registers. This results in all
10590 // registers being marked as clobbered. This can't work if the dispatch block
10591 // is in a Thumb1 function and is linked with ARM code which uses the FP
10592 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10593 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
10595 bool IsPositionIndependent = isPositionIndependent();
10596 unsigned NumLPads = LPadList.size();
10597 if (Subtarget->isThumb2()) {
10598 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10599 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10600 .addFrameIndex(FI)
10601 .addImm(4)
10602 .addMemOperand(FIMMOLd)
10603 .add(predOps(ARMCC::AL));
10605 if (NumLPads < 256) {
10606 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10607 .addReg(NewVReg1)
10608 .addImm(LPadList.size())
10609 .add(predOps(ARMCC::AL));
10610 } else {
10611 Register VReg1 = MRI->createVirtualRegister(TRC);
10612 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10613 .addImm(NumLPads & 0xFFFF)
10614 .add(predOps(ARMCC::AL));
10616 unsigned VReg2 = VReg1;
10617 if ((NumLPads & 0xFFFF0000) != 0) {
10618 VReg2 = MRI->createVirtualRegister(TRC);
10619 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10620 .addReg(VReg1)
10621 .addImm(NumLPads >> 16)
10622 .add(predOps(ARMCC::AL));
10623 }
10625 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10626 .addReg(NewVReg1)
10627 .addReg(VReg2)
10628 .add(predOps(ARMCC::AL));
10629 }
10631 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10632 .addMBB(TrapBB)
10633 .addImm(ARMCC::HI)
10634 .addReg(ARM::CPSR);
10636 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10637 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10638 .addJumpTableIndex(MJTI)
10639 .add(predOps(ARMCC::AL));
10641 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10642 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10643 .addReg(NewVReg3, RegState::Kill)
10644 .addReg(NewVReg1)
10645 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
10646 .add(predOps(ARMCC::AL))
10647 .add(condCodeOp());
10649 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10650 .addReg(NewVReg4, RegState::Kill)
10651 .addReg(NewVReg1)
10652 .addJumpTableIndex(MJTI);
10653 } else if (Subtarget->isThumb()) {
10654 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10655 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10656 .addFrameIndex(FI)
10657 .addImm(1)
10658 .addMemOperand(FIMMOLd)
10659 .add(predOps(ARMCC::AL));
10661 if (NumLPads < 256) {
10662 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10663 .addReg(NewVReg1)
10664 .addImm(NumLPads)
10665 .add(predOps(ARMCC::AL));
10666 } else {
10667 MachineConstantPool *ConstantPool = MF->getConstantPool();
10668 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10669 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10671 // MachineConstantPool wants an explicit alignment.
10672 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10673 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10675 Register VReg1 = MRI->createVirtualRegister(TRC);
10676 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10677 .addReg(VReg1, RegState::Define)
10678 .addConstantPoolIndex(Idx)
10679 .add(predOps(ARMCC::AL));
10680 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10681 .addReg(NewVReg1)
10682 .addReg(VReg1)
10683 .add(predOps(ARMCC::AL));
10684 }
10686 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10687 .addMBB(TrapBB)
10688 .addImm(ARMCC::HI)
10689 .addReg(ARM::CPSR);
10691 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10692 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
10693 .addReg(ARM::CPSR, RegState::Define)
10694 .addReg(NewVReg1)
10695 .addImm(2)
10696 .add(predOps(ARMCC::AL));
10698 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10699 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
10700 .addJumpTableIndex(MJTI)
10701 .add(predOps(ARMCC::AL));
10703 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10704 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
10705 .addReg(ARM::CPSR, RegState::Define)
10706 .addReg(NewVReg2, RegState::Kill)
10707 .addReg(NewVReg3)
10708 .add(predOps(ARMCC::AL));
10710 MachineMemOperand *JTMMOLd =
10711 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10712 MachineMemOperand::MOLoad, 4, Align(4));
10714 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10715 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
10716 .addReg(NewVReg4, RegState::Kill)
10717 .addImm(0)
10718 .addMemOperand(JTMMOLd)
10719 .add(predOps(ARMCC::AL));
10721 unsigned NewVReg6 = NewVReg5;
10722 if (IsPositionIndependent) {
10723 NewVReg6 = MRI->createVirtualRegister(TRC);
10724 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
10725 .addReg(ARM::CPSR, RegState::Define)
10726 .addReg(NewVReg5, RegState::Kill)
10727 .addReg(NewVReg3)
10728 .add(predOps(ARMCC::AL));
10729 }
10731 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
10732 .addReg(NewVReg6, RegState::Kill)
10733 .addJumpTableIndex(MJTI);
10734 } else {
10735 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10736 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
10737 .addFrameIndex(FI)
10738 .addImm(4)
10739 .addMemOperand(FIMMOLd)
10740 .add(predOps(ARMCC::AL));
10742 if (NumLPads < 256) {
10743 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
10744 .addReg(NewVReg1)
10745 .addImm(NumLPads)
10746 .add(predOps(ARMCC::AL));
10747 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
10748 Register VReg1 = MRI->createVirtualRegister(TRC);
10749 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
10750 .addImm(NumLPads & 0xFFFF)
10751 .add(predOps(ARMCC::AL));
10753 unsigned VReg2 = VReg1;
10754 if ((NumLPads & 0xFFFF0000) != 0) {
10755 VReg2 = MRI->createVirtualRegister(TRC);
10756 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
10757 .addReg(VReg1)
10758 .addImm(NumLPads >> 16)
10759 .add(predOps(ARMCC::AL));
10760 }
10762 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10763 .addReg(NewVReg1)
10764 .addReg(VReg2)
10765 .add(predOps(ARMCC::AL));
10766 } else {
10767 MachineConstantPool *ConstantPool = MF->getConstantPool();
10768 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10769 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10771 // MachineConstantPool wants an explicit alignment.
10772 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10773 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10775 Register VReg1 = MRI->createVirtualRegister(TRC);
10776 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
10777 .addReg(VReg1, RegState::Define)
10778 .addConstantPoolIndex(Idx)
10779 .addImm(0)
10780 .add(predOps(ARMCC::AL));
10781 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10782 .addReg(NewVReg1)
10783 .addReg(VReg1, RegState::Kill)
10784 .add(predOps(ARMCC::AL));
10785 }
10787 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
10788 .addMBB(TrapBB)
10789 .addImm(ARMCC::HI)
10790 .addReg(ARM::CPSR);
10792 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10793 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
10794 .addReg(NewVReg1)
10795 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
10796 .add(predOps(ARMCC::AL))
10797 .add(condCodeOp());
10798 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10799 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
10800 .addJumpTableIndex(MJTI)
10801 .add(predOps(ARMCC::AL));
10803 MachineMemOperand *JTMMOLd =
10804 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10805 MachineMemOperand::MOLoad, 4, Align(4));
10806 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10807 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
10808 .addReg(NewVReg3, RegState::Kill)
10809 .addReg(NewVReg4)
10810 .addImm(0)
10811 .addMemOperand(JTMMOLd)
10812 .add(predOps(ARMCC::AL));
10814 if (IsPositionIndependent) {
10815 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
10816 .addReg(NewVReg5, RegState::Kill)
10817 .addReg(NewVReg4)
10818 .addJumpTableIndex(MJTI);
10819 } else {
10820 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
10821 .addReg(NewVReg5, RegState::Kill)
10822 .addJumpTableIndex(MJTI);
10823 }
10824 }
10826 // Add the jump table entries as successors to the MBB.
10827 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
10828 for (std::vector<MachineBasicBlock*>::iterator
10829 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
10830 MachineBasicBlock *CurMBB = *I;
10831 if (SeenMBBs.insert(CurMBB).second)
10832 DispContBB->addSuccessor(CurMBB);
10833 }
10835 // N.B. the order the invoke BBs are processed in doesn't matter here.
10836 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
10837 SmallVector<MachineBasicBlock*, 64> MBBLPads;
10838 for (MachineBasicBlock *BB : InvokeBBs) {
10840 // Remove the landing pad successor from the invoke block and replace it
10841 // with the new dispatch block.
10842 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
10843 while (!Successors.empty()) {
10844 MachineBasicBlock *SMBB = Successors.pop_back_val();
10845 if (SMBB->isEHPad()) {
10846 BB->removeSuccessor(SMBB);
10847 MBBLPads.push_back(SMBB);
10848 }
10849 }
10851 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
10852 BB->normalizeSuccProbs();
10854 // Find the invoke call and mark all of the callee-saved registers as
10855 // 'implicit defined' so that they're spilled. This prevents code from
10856 // moving instructions to before the EH block, where they will never be
10857 // executed.
10858 for (MachineBasicBlock::reverse_iterator
10859 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
10860 if (!II->isCall()) continue;
10862 DenseMap<unsigned, bool> DefRegs;
10863 for (MachineInstr::mop_iterator
10864 OI = II->operands_begin(), OE = II->operands_end();
10865 OI != OE; ++OI) {
10866 if (!OI->isReg()) continue;
10867 DefRegs[OI->getReg()] = true;
10868 }
10870 MachineInstrBuilder MIB(*MF, &*II);
10872 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
10873 unsigned Reg = SavedRegs[i];
10874 if (Subtarget->isThumb2() &&
10875 !ARM::tGPRRegClass.contains(Reg) &&
10876 !ARM::hGPRRegClass.contains(Reg))
10877 continue;
10878 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
10879 continue;
10880 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
10881 continue;
10882 if (!DefRegs[Reg])
10883 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
10884 }
10886 break;
10887 }
10888 }
10890 // Mark all former landing pads as non-landing pads. The dispatch is the only
10891 // landing pad now.
10892 for (SmallVectorImpl<MachineBasicBlock*>::iterator
10893 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
10894 (*I)->setIsEHPad(false);
10896 // The instruction is gone now.
10897 MI.eraseFromParent();
10898 }
10900 static
10901 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
10902 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
10903 E = MBB->succ_end(); I != E; ++I)
10904 if (*I != Succ)
10905 return *I;
10906 llvm_unreachable("Expecting a BB with two successors!");
10907 }
10909 /// Return the load opcode for a given load size. If the load size is >= 8, a
10910 /// NEON opcode will be returned.
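// Reading of the table below (no new behaviour): a 16-byte unit selects
// VLD1q32wb_fixed, an 8-byte unit VLD1d32wb_fixed, while a 4-byte unit becomes
// t2LDR_POST on Thumb-2, tLDRi on Thumb-1 and LDR_POST_IMM otherwise.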
10911 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
10912 if (LdSize >= 8)
10913 return LdSize == 16 ? ARM::VLD1q32wb_fixed
10914 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
10915 if (IsThumb1)
10916 return LdSize == 4 ? ARM::tLDRi
10917 : LdSize == 2 ? ARM::tLDRHi
10918 : LdSize == 1 ? ARM::tLDRBi : 0;
10919 if (IsThumb2)
10920 return LdSize == 4 ? ARM::t2LDR_POST
10921 : LdSize == 2 ? ARM::t2LDRH_POST
10922 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
10923 return LdSize == 4 ? ARM::LDR_POST_IMM
10924 : LdSize == 2 ? ARM::LDRH_POST
10925 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
10926 }
10928 /// Return the store opcode for a given store size. If the store size is >= 8,
10929 /// a NEON opcode will be returned.
10930 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
10931 if (StSize >= 8)
10932 return StSize == 16 ? ARM::VST1q32wb_fixed
10933 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
10934 if (IsThumb1)
10935 return StSize == 4 ? ARM::tSTRi
10936 : StSize == 2 ? ARM::tSTRHi
10937 : StSize == 1 ? ARM::tSTRBi : 0;
10938 if (IsThumb2)
10939 return StSize == 4 ? ARM::t2STR_POST
10940 : StSize == 2 ? ARM::t2STRH_POST
10941 : StSize == 1 ? ARM::t2STRB_POST : 0;
10942 return StSize == 4 ? ARM::STR_POST_IMM
10943 : StSize == 2 ? ARM::STRH_POST
10944 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
10945 }
10947 /// Emit a post-increment load operation with given size. The instructions
10948 /// will be added to BB at Pos.
10949 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
10950 const TargetInstrInfo *TII, const DebugLoc &dl,
10951 unsigned LdSize, unsigned Data, unsigned AddrIn,
10952 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
10953 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
10954 assert(LdOpc != 0 && "Should have a load opcode");
10955 if (LdSize >= 8) {
10956 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10957 .addReg(AddrOut, RegState::Define)
10958 .addReg(AddrIn)
10959 .addImm(0)
10960 .add(predOps(ARMCC::AL));
10961 } else if (IsThumb1) {
10962 // load + update AddrIn
10963 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10964 .addReg(AddrIn)
10965 .addImm(0)
10966 .add(predOps(ARMCC::AL));
10967 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
10968 .add(t1CondCodeOp())
10969 .addReg(AddrIn)
10970 .addImm(LdSize)
10971 .add(predOps(ARMCC::AL));
10972 } else if (IsThumb2) {
10973 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10974 .addReg(AddrOut, RegState::Define)
10975 .addReg(AddrIn)
10976 .addImm(LdSize)
10977 .add(predOps(ARMCC::AL));
10978 } else { // arm
10979 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10980 .addReg(AddrOut, RegState::Define)
10981 .addReg(AddrIn)
10982 .addReg(0)
10983 .addImm(LdSize)
10984 .add(predOps(ARMCC::AL));
10985 }
10986 }
10988 /// Emit a post-increment store operation with given size. The instructions
10989 /// will be added to BB at Pos.
10990 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
10991 const TargetInstrInfo *TII, const DebugLoc &dl,
10992 unsigned StSize, unsigned Data, unsigned AddrIn,
10993 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
10994 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
10995 assert(StOpc != 0 && "Should have a store opcode");
10996 if (StSize >= 8) {
10997 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
10998 .addReg(AddrIn)
10999 .addImm(0)
11000 .addReg(Data)
11001 .add(predOps(ARMCC::AL));
11002 } else if (IsThumb1) {
11003 // store + update AddrIn
11004 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11005 .addReg(Data)
11006 .addReg(AddrIn)
11007 .addImm(0)
11008 .add(predOps(ARMCC::AL));
11009 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11010 .add(t1CondCodeOp())
11011 .addReg(AddrIn)
11012 .addImm(StSize)
11013 .add(predOps(ARMCC::AL));
11014 } else if (IsThumb2) {
11015 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11016 .addReg(Data)
11017 .addReg(AddrIn)
11018 .addImm(StSize)
11019 .add(predOps(ARMCC::AL));
11020 } else { // arm
11021 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11022 .addReg(Data)
11023 .addReg(AddrIn)
11024 .addReg(0)
11025 .addImm(StSize)
11026 .add(predOps(ARMCC::AL));
11027 }
11028 }
11030 MachineBasicBlock *
11031 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11032 MachineBasicBlock *BB) const {
11033 // This pseudo instruction has 4 operands: dst, src, size and alignment.
11034 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11035 // Otherwise, we will generate unrolled scalar copies.
11036 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11037 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11038 MachineFunction::iterator It = ++BB->getIterator();
11040 Register dest = MI.getOperand(0).getReg();
11041 Register src = MI.getOperand(1).getReg();
11042 unsigned SizeVal = MI.getOperand(2).getImm();
11043 unsigned Alignment = MI.getOperand(3).getImm();
11044 DebugLoc dl = MI.getDebugLoc();
11046 MachineFunction *MF = BB->getParent();
11047 MachineRegisterInfo &MRI = MF->getRegInfo();
11048 unsigned UnitSize = 0;
11049 const TargetRegisterClass *TRC = nullptr;
11050 const TargetRegisterClass *VecTRC = nullptr;
11052 bool IsThumb1 = Subtarget->isThumb1Only();
11053 bool IsThumb2 = Subtarget->isThumb2();
11054 bool IsThumb = Subtarget->isThumb();
11056 if (Alignment & 1) {
11057 UnitSize = 1;
11058 } else if (Alignment & 2) {
11059 UnitSize = 2;
11060 } else {
11061 // Check whether we can use NEON instructions.
11062 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11063 Subtarget->hasNEON()) {
11064 if ((Alignment % 16 == 0) && SizeVal >= 16)
11065 UnitSize = 16;
11066 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11067 UnitSize = 8;
11068 }
11069 // Can't use NEON instructions.
11070 if (UnitSize == 0)
11071 UnitSize = 4;
11072 }
11074 // Select the correct opcode and register class for unit size load/store
11075 bool IsNeon = UnitSize >= 8;
11076 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11077 if (IsNeon)
11078 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11079 : UnitSize == 8 ? &ARM::DPRRegClass
11080 : nullptr;
11082 unsigned BytesLeft = SizeVal % UnitSize;
11083 unsigned LoopSize = SizeVal - BytesLeft;
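// Worked example: SizeVal = 37 with 16-byte alignment on a NEON-capable
// target gives UnitSize = 16, BytesLeft = 37 % 16 = 5 and LoopSize = 32, i.e.
// two 16-byte copies followed by five single-byte copies.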
11085 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11086 // Use LDR and STR to copy.
11087 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11088 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11089 unsigned srcIn = src;
11090 unsigned destIn = dest;
11091 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11092 Register srcOut = MRI.createVirtualRegister(TRC);
11093 Register destOut = MRI.createVirtualRegister(TRC);
11094 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11095 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11096 IsThumb1, IsThumb2);
11097 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11098 IsThumb1, IsThumb2);
11099 srcIn = srcOut;
11100 destIn = destOut;
11101 }
11103 // Handle the leftover bytes with LDRB and STRB.
11104 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11105 // [destOut] = STRB_POST(scratch, destIn, 1)
11106 for (unsigned i = 0; i < BytesLeft; i++) {
11107 Register srcOut = MRI.createVirtualRegister(TRC);
11108 Register destOut = MRI.createVirtualRegister(TRC);
11109 Register scratch = MRI.createVirtualRegister(TRC);
11110 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11111 IsThumb1, IsThumb2);
11112 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11113 IsThumb1, IsThumb2);
11114 srcIn = srcOut;
11115 destIn = destOut;
11116 }
11117 MI.eraseFromParent(); // The instruction is gone now.
11118 return BB;
11119 }
11121 // Expand the pseudo op to a loop.
11122 // thisMBB:
11123 // ...
11124 // movw varEnd, # --> with thumb2
11125 // movt varEnd, #
11126 // ldrcp varEnd, idx --> without thumb2
11127 // fallthrough --> loopMBB
11128 // loopMBB:
11129 // PHI varPhi, varEnd, varLoop
11130 // PHI srcPhi, src, srcLoop
11131 // PHI destPhi, dst, destLoop
11132 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11133 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11134 // subs varLoop, varPhi, #UnitSize
11135 // bne loopMBB
11136 // fallthrough --> exitMBB
11137 // exitMBB:
11138 // epilogue to handle left-over bytes
11139 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11140 // [destOut] = STRB_POST(scratch, destLoop, 1)
11141 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11142 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11143 MF->insert(It, loopMBB);
11144 MF->insert(It, exitMBB);
11146 // Transfer the remainder of BB and its successor edges to exitMBB.
11147 exitMBB->splice(exitMBB->begin(), BB,
11148 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11149 exitMBB->transferSuccessorsAndUpdatePHIs(BB);
11151 // Load an immediate to varEnd.
11152 Register varEnd = MRI.createVirtualRegister(TRC);
11153 if (Subtarget->useMovt()) {
11154 unsigned Vtmp = varEnd;
11155 if ((LoopSize & 0xFFFF0000) != 0)
11156 Vtmp = MRI.createVirtualRegister(TRC);
11157 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
11158 .addImm(LoopSize & 0xFFFF)
11159 .add(predOps(ARMCC::AL));
11161 if ((LoopSize & 0xFFFF0000) != 0)
11162 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
11163 .addReg(Vtmp)
11164 .addImm(LoopSize >> 16)
11165 .add(predOps(ARMCC::AL));
11166 } else {
11167 MachineConstantPool *ConstantPool = MF->getConstantPool();
11168 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11169 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11171 // MachineConstantPool wants an explicit alignment.
11172 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11173 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11174 MachineMemOperand *CPMMO =
11175 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
11176 MachineMemOperand::MOLoad, 4, Align(4));
11178 if (IsThumb)
11179 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11180 .addReg(varEnd, RegState::Define)
11181 .addConstantPoolIndex(Idx)
11182 .add(predOps(ARMCC::AL))
11183 .addMemOperand(CPMMO);
11184 else
11185 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11186 .addReg(varEnd, RegState::Define)
11187 .addConstantPoolIndex(Idx)
11188 .addImm(0)
11189 .add(predOps(ARMCC::AL))
11190 .addMemOperand(CPMMO);
11191 }
11192 BB->addSuccessor(loopMBB);
11194 // Generate the loop body:
11195 // varPhi = PHI(varLoop, varEnd)
11196 // srcPhi = PHI(srcLoop, src)
11197 // destPhi = PHI(destLoop, dst)
11198 MachineBasicBlock *entryBB = BB;
11199 BB = loopMBB;
11200 Register varLoop = MRI.createVirtualRegister(TRC);
11201 Register varPhi = MRI.createVirtualRegister(TRC);
11202 Register srcLoop = MRI.createVirtualRegister(TRC);
11203 Register srcPhi = MRI.createVirtualRegister(TRC);
11204 Register destLoop = MRI.createVirtualRegister(TRC);
11205 Register destPhi = MRI.createVirtualRegister(TRC);
11207 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11208 .addReg(varLoop).addMBB(loopMBB)
11209 .addReg(varEnd).addMBB(entryBB);
11210 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11211 .addReg(srcLoop).addMBB(loopMBB)
11212 .addReg(src).addMBB(entryBB);
11213 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11214 .addReg(destLoop).addMBB(loopMBB)
11215 .addReg(dest).addMBB(entryBB);
11217 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11218 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11219 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11220 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11221 IsThumb1, IsThumb2);
11222 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11223 IsThumb1, IsThumb2);
11225 // Decrement loop variable by UnitSize.
11226 if (IsThumb1) {
11227 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11228 .add(t1CondCodeOp())
11229 .addReg(varPhi)
11230 .addImm(UnitSize)
11231 .add(predOps(ARMCC::AL));
11232 } else {
11233 MachineInstrBuilder MIB =
11234 BuildMI(*BB, BB->end(), dl,
11235 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11236 MIB.addReg(varPhi)
11237 .addImm(UnitSize)
11238 .add(predOps(ARMCC::AL))
11239 .add(condCodeOp());
11240 MIB->getOperand(5).setReg(ARM::CPSR);
11241 MIB->getOperand(5).setIsDef(true);
11242 }
11243 BuildMI(*BB, BB->end(), dl,
11244 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11245 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11247 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11248 BB->addSuccessor(loopMBB);
11249 BB->addSuccessor(exitMBB);
11251 // Add epilogue to handle BytesLeft.
11252 BB = exitMBB;
11253 auto StartOfExit = exitMBB->begin();
11255 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11256 // [destOut] = STRB_POST(scratch, destLoop, 1)
11257 unsigned srcIn = srcLoop;
11258 unsigned destIn = destLoop;
11259 for (unsigned i = 0; i < BytesLeft; i++) {
11260 Register srcOut = MRI.createVirtualRegister(TRC);
11261 Register destOut = MRI.createVirtualRegister(TRC);
11262 Register scratch = MRI.createVirtualRegister(TRC);
11263 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11264 IsThumb1, IsThumb2);
11265 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11266 IsThumb1, IsThumb2);
11267 srcIn = srcOut;
11268 destIn = destOut;
11269 }
11271 MI.eraseFromParent(); // The instruction is gone now.
11272 return BB;
11273 }
11275 MachineBasicBlock *
11276 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11277 MachineBasicBlock *MBB) const {
11278 const TargetMachine &TM = getTargetMachine();
11279 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11280 DebugLoc DL = MI.getDebugLoc();
11282 assert(Subtarget->isTargetWindows() &&
11283 "__chkstk is only supported on Windows");
11284 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11286 // __chkstk takes the number of words to allocate on the stack in R4, and
11287 // returns the stack adjustment in number of bytes in R4. This will not
11288 // clobber any other registers (other than the obvious lr).
11290 // Although, technically, IP should be considered a register which may be
11291 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11292 // thumb-2 environment, so there is no interworking required. As a result, we
11293 // do not expect a veneer to be emitted by the linker, clobbering IP.
11295 // Each module receives its own copy of __chkstk, so no import thunk is
11296 // required, again, ensuring that IP is not clobbered.
11298 // Finally, although some linkers may theoretically provide a trampoline for
11299 // out of range calls (which is quite common due to a 32M range limitation of
11300 // branches for Thumb), we can generate the long-call version via
11301 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11302 // IP.
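// Roughly, for the small/medium/kernel code models the sequence emitted below
// is:
//   bl __chkstk
//   sub.w sp, sp, r4
// with r4 holding the allocation size in words on entry and the adjustment in
// bytes on return, as described above.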
11304 switch (TM.getCodeModel()) {
11305 case CodeModel::Tiny:
11306 llvm_unreachable("Tiny code model not available on ARM.");
11307 case CodeModel::Small:
11308 case CodeModel::Medium:
11309 case CodeModel::Kernel:
11310 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11311 .add(predOps(ARMCC::AL))
11312 .addExternalSymbol("__chkstk")
11313 .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
11314 .addReg(ARM::R4, RegState::Implicit | RegState::Define)
11315 .addReg(ARM::R12,
11316 RegState::Implicit | RegState::Define | RegState::Dead)
11317 .addReg(ARM::CPSR,
11318 RegState::Implicit | RegState::Define | RegState::Dead);
11319 break;
11320 case CodeModel::Large: {
11321 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11322 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11324 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11325 .addExternalSymbol("__chkstk");
11326 BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))
11327 .add(predOps(ARMCC::AL))
11328 .addReg(Reg, RegState::Kill)
11329 .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
11330 .addReg(ARM::R4, RegState::Implicit | RegState::Define)
11331 .addReg(ARM::R12,
11332 RegState::Implicit | RegState::Define | RegState::Dead)
11333 .addReg(ARM::CPSR,
11334 RegState::Implicit | RegState::Define | RegState::Dead);
11335 break;
11336 }
11337 }
11339 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11340 .addReg(ARM::SP, RegState::Kill)
11341 .addReg(ARM::R4, RegState::Kill)
11342 .setMIFlags(MachineInstr::FrameSetup)
11343 .add(predOps(ARMCC::AL))
11344 .add(condCodeOp());
11346 MI.eraseFromParent();
11347 return MBB;
11348 }
11350 MachineBasicBlock *
11351 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11352 MachineBasicBlock *MBB) const {
11353 DebugLoc DL = MI.getDebugLoc();
11354 MachineFunction *MF = MBB->getParent();
11355 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11357 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11358 MF->insert(++MBB->getIterator(), ContBB);
11359 ContBB->splice(ContBB->begin(), MBB,
11360 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11361 ContBB->transferSuccessorsAndUpdatePHIs(MBB);
11362 MBB->addSuccessor(ContBB);
11364 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11365 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11366 MF->push_back(TrapBB);
11367 MBB->addSuccessor(TrapBB);
11369 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11370 .addReg(MI.getOperand(0).getReg())
11371 .addImm(0)
11372 .add(predOps(ARMCC::AL));
11373 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11374 .addMBB(TrapBB)
11375 .addImm(ARMCC::EQ)
11376 .addReg(ARM::CPSR);
11378 MI.eraseFromParent();
11379 return ContBB;
11380 }
11382 // The CPSR operand of SelectItr might be missing a kill marker
11383 // because there were multiple uses of CPSR, and ISel didn't know
11384 // which to mark. Figure out whether SelectItr should have had a
11385 // kill marker, and set it if it should. Returns the correct kill
11386 // marker value.
11387 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11388 MachineBasicBlock* BB,
11389 const TargetRegisterInfo* TRI) {
11390 // Scan forward through BB for a use/def of CPSR.
11391 MachineBasicBlock::iterator miI(std::next(SelectItr));
11392 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11393 const MachineInstr& mi = *miI;
11394 if (mi.readsRegister(ARM::CPSR))
11395 return false;
11396 if (mi.definesRegister(ARM::CPSR))
11397 break; // Should have kill-flag - update below.
11398 }
11400 // If we hit the end of the block, check whether CPSR is live into a
11401 // successor.
11402 if (miI == BB->end()) {
11403 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
11404 sEnd = BB->succ_end();
11405 sItr != sEnd; ++sItr) {
11406 MachineBasicBlock* succ = *sItr;
11407 if (succ->isLiveIn(ARM::CPSR))
11408 return false;
11409 }
11410 }
11412 // We found a def, or hit the end of the basic block and CPSR wasn't live
11413 // out. SelectMI should have a kill flag on CPSR.
11414 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11415 return true;
11416 }
11418 /// Adds logic in the loop entry MBB to calculate the loop iteration count and
11419 /// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop
11420 static Register genTPEntry(MachineBasicBlock *TpEntry,
11421 MachineBasicBlock *TpLoopBody,
11422 MachineBasicBlock *TpExit, Register OpSizeReg,
11423 const TargetInstrInfo *TII, DebugLoc Dl,
11424 MachineRegisterInfo &MRI) {
11425 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
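// For instance, n = 100 gives (100 + 15) >> 4 = 115 >> 4 = 7, matching
// ceil(100 / 16) = 7 tail-predicated iterations.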
11426 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11427 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11428 .addUse(OpSizeReg)
11429 .addImm(15)
11430 .add(predOps(ARMCC::AL))
11431 .addReg(0);
11433 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11434 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11435 .addUse(AddDestReg, RegState::Kill)
11436 .addImm(4)
11437 .add(predOps(ARMCC::AL))
11438 .addReg(0);
11440 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11441 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11442 .addUse(LsrDestReg, RegState::Kill);
11444 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11445 .addUse(TotalIterationsReg)
11446 .addMBB(TpExit);
11448 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11449 .addMBB(TpLoopBody)
11450 .add(predOps(ARMCC::AL));
11452 return TotalIterationsReg;
11453 }
11455 /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
11456 /// t2LoopEnd. These are used by later passes to generate tail-predicated
11457 /// loops.
11458 static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11459 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11460 const TargetInstrInfo *TII, DebugLoc Dl,
11461 MachineRegisterInfo &MRI, Register OpSrcReg,
11462 Register OpDestReg, Register ElementCountReg,
11463 Register TotalIterationsReg, bool IsMemcpy) {
11464 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11465 // array, loop iteration counter, predication counter.
11467 Register SrcPhiReg, CurrSrcReg;
11468 if (IsMemcpy) {
11469 // Current position in the src array
11470 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11471 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11472 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11473 .addUse(OpSrcReg)
11474 .addMBB(TpEntry)
11475 .addUse(CurrSrcReg)
11476 .addMBB(TpLoopBody);
11477 }
11479 // Current position in the dest array
11480 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11481 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11482 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11483 .addUse(OpDestReg)
11484 .addMBB(TpEntry)
11485 .addUse(CurrDestReg)
11486 .addMBB(TpLoopBody);
11488 // Current loop counter
11489 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11490 Register RemainingLoopIterationsReg =
11491 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11492 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11493 .addUse(TotalIterationsReg)
11494 .addMBB(TpEntry)
11495 .addUse(RemainingLoopIterationsReg)
11496 .addMBB(TpLoopBody);
11498 // Predication counter
11499 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11500 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11501 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11502 .addUse(ElementCountReg)
11503 .addMBB(TpEntry)
11504 .addUse(RemainingElementsReg)
11505 .addMBB(TpLoopBody);
11507 // Pass predication counter to VCTP
11508 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11509 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11510 .addUse(PredCounterPhiReg)
11511 .addImm(ARMVCC::None)
11512 .addReg(0);
11514 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11515 .addUse(PredCounterPhiReg)
11516 .addImm(16)
11517 .add(predOps(ARMCC::AL))
11518 .addReg(0);
11520 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11521 Register SrcValueReg;
11522 if (IsMemcpy) {
11523 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11524 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11525 .addDef(CurrSrcReg)
11526 .addDef(SrcValueReg)
11527 .addReg(SrcPhiReg)
11528 .addImm(16)
11529 .addImm(ARMVCC::Then)
11530 .addUse(VccrReg);
11531 } else
11532 SrcValueReg = OpSrcReg;
11534 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11535 .addDef(CurrDestReg)
11536 .addUse(SrcValueReg)
11537 .addReg(DestPhiReg)
11538 .addImm(16)
11539 .addImm(ARMVCC::Then)
11540 .addUse(VccrReg);
11542 // Add the pseudo instructions for decrementing the loop counter and marking
11543 // the end: t2LoopDec and t2LoopEnd
11544 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11545 .addUse(LoopCounterPhiReg)
11546 .addImm(1);
11548 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11549 .addUse(RemainingLoopIterationsReg)
11550 .addMBB(TpLoopBody);
11552 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11553 .addMBB(TpExit)
11554 .add(predOps(ARMCC::AL));
11555 }
11557 MachineBasicBlock *
11558 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
11559 MachineBasicBlock *BB) const {
11560 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11561 DebugLoc dl = MI.getDebugLoc();
11562 bool isThumb2 = Subtarget->isThumb2();
11563 switch (MI.getOpcode()) {
11564 default: {
11565 MI.print(errs());
11566 llvm_unreachable("Unexpected instr type to insert");
11569 // Thumb1 post-indexed loads are really just single-register LDMs.
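// For example (illustrative registers), a post-indexed "ldr r0, [r1], #4" is
// emitted here as "ldm r1!, {r0}".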
11570 case ARM::tLDR_postidx: {
11571 MachineOperand Def(MI.getOperand(1));
11572 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11573 .add(Def) // Rn_wb
11574 .add(MI.getOperand(2)) // Rn
11575 .add(MI.getOperand(3)) // PredImm
11576 .add(MI.getOperand(4)) // PredReg
11577 .add(MI.getOperand(0)) // Rt
11578 .cloneMemRefs(MI);
11579 MI.eraseFromParent();
11580 return BB;
11583 case ARM::MVE_MEMCPYLOOPINST:
11584 case ARM::MVE_MEMSETLOOPINST: {
11586 // The transformation below expands the MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST
11587 // pseudo into a Tail Predicated (TP) loop. It adds the instructions to
11588 // calculate the iteration count (= ceil(size_in_bytes / 16)) in the TP entry
11589 // block and adds the relevant instructions in the TP loop body to generate a
11590 // WLSTP loop.
11592 // Below is relevant portion of the CFG after the transformation.
11593 // The Machine Basic Blocks are shown along with branch conditions (in
11594 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11595 // portion of the CFG and may not necessarily be the entry/exit of the
11596 // function.
11598 // (Relevant) CFG after transformation:
11599 // TP entry MBB
11600 // |
11601 // |-----------------|
11602 // (n <= 0) (n > 0)
11603 // | |
11604 // | TP loop Body MBB<--|
11605 // | | |
11606 // \ |___________|
11607 // \ /
11608 // TP exit MBB
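//
// As a rough illustration (register names here are placeholders), the expanded
// loop corresponds to MVE code of the form:
//   wlstp.8  lr, rSize, .Lexit      ; TP entry: while-loop-start
// .Lbody:
//   vldrb.u8 q0, [rSrc], #16        ; predicated load (memcpy only)
//   vstrb.8  q0, [rDst], #16        ; predicated store
//   letp     lr, .Lbody             ; TP loop body: loop-end
// .Lexit: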
11610 MachineFunction *MF = BB->getParent();
11611 MachineFunctionProperties &Properties = MF->getProperties();
11612 MachineRegisterInfo &MRI = MF->getRegInfo();
11614 Register OpDestReg = MI.getOperand(0).getReg();
11615 Register OpSrcReg = MI.getOperand(1).getReg();
11616 Register OpSizeReg = MI.getOperand(2).getReg();
11618 // Allocate the required MBBs and add to parent function.
11619 MachineBasicBlock *TpEntry = BB;
11620 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11621 MachineBasicBlock *TpExit;
11623 MF->push_back(TpLoopBody);
11625 // If any instructions are present in the current block after
11626 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11627 // move the instructions into the newly created exit block. If there are no
11628 // instructions add an explicit branch to the FallThrough block and then
11629 // split.
11631 // The split is required for two reasons:
11632 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
11633 // 2) Since a TPLoopBody will be added later, any PHIs in successor blocks
11634 // need to be updated. splitAt() already handles this.
11635 TpExit = BB->splitAt(MI, false);
11636 if (TpExit == BB) {
11637 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11638 "block containing memcpy/memset Pseudo");
11639 TpExit = BB->getFallThrough();
11640 BuildMI(BB, dl, TII->get(ARM::t2B))
11641 .addMBB(TpExit)
11642 .add(predOps(ARMCC::AL));
11643 TpExit = BB->splitAt(MI, false);
11646 // Add logic for iteration count
11647 Register TotalIterationsReg =
11648 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
11650 // Add the vectorized (and predicated) loads/store instructions
11651 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
11652 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
11653 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
11655 // Required to avoid conflict with the MachineVerifier during testing.
11656 Properties.reset(MachineFunctionProperties::Property::NoPHIs);
11658 // Connect the blocks
11659 TpEntry->addSuccessor(TpLoopBody);
11660 TpLoopBody->addSuccessor(TpLoopBody);
11661 TpLoopBody->addSuccessor(TpExit);
11663 // Reorder for a more natural layout
11664 TpLoopBody->moveAfter(TpEntry);
11665 TpExit->moveAfter(TpLoopBody);
11667 // Finally, remove the memcpy pseudo instruction
11668 MI.eraseFromParent();
11670 // Return the exit block as it may contain other instructions requiring a
11671 // custom inserter
11672 return TpExit;
11675 // The Thumb2 pre-indexed stores have the same MI operands; they are just
11676 // defined differently in the .td files from the isel patterns, so
11677 // they need pseudos.
11678 case ARM::t2STR_preidx:
11679 MI.setDesc(TII->get(ARM::t2STR_PRE));
11680 return BB;
11681 case ARM::t2STRB_preidx:
11682 MI.setDesc(TII->get(ARM::t2STRB_PRE));
11683 return BB;
11684 case ARM::t2STRH_preidx:
11685 MI.setDesc(TII->get(ARM::t2STRH_PRE));
11686 return BB;
11688 case ARM::STRi_preidx:
11689 case ARM::STRBi_preidx: {
11690 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
11691 : ARM::STRB_PRE_IMM;
11692 // Decode the offset.
11693 unsigned Offset = MI.getOperand(4).getImm();
11694 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
11695 Offset = ARM_AM::getAM2Offset(Offset);
11696 if (isSub)
11697 Offset = -Offset;
11699 MachineMemOperand *MMO = *MI.memoperands_begin();
11700 BuildMI(*BB, MI, dl, TII->get(NewOpc))
11701 .add(MI.getOperand(0)) // Rn_wb
11702 .add(MI.getOperand(1)) // Rt
11703 .add(MI.getOperand(2)) // Rn
11704 .addImm(Offset) // offset (skip GPR==zero_reg)
11705 .add(MI.getOperand(5)) // pred
11706 .add(MI.getOperand(6))
11707 .addMemOperand(MMO);
11708 MI.eraseFromParent();
11709 return BB;
11711 case ARM::STRr_preidx:
11712 case ARM::STRBr_preidx:
11713 case ARM::STRH_preidx: {
11714 unsigned NewOpc;
11715 switch (MI.getOpcode()) {
11716 default: llvm_unreachable("unexpected opcode!");
11717 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
11718 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
11719 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
11721 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
11722 for (unsigned i = 0; i < MI.getNumOperands(); ++i)
11723 MIB.add(MI.getOperand(i));
11724 MI.eraseFromParent();
11725 return BB;
11728 case ARM::tMOVCCr_pseudo: {
11729 // To "insert" a SELECT_CC instruction, we actually have to insert the
11730 // diamond control-flow pattern. The incoming instruction knows the
11731 // destination vreg to set, the condition code register to branch on, the
11732 // true/false values to select between, and a branch opcode to use.
11733 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11734 MachineFunction::iterator It = ++BB->getIterator();
11736 // thisMBB:
11737 // ...
11738 // TrueVal = ...
11739 // cmpTY ccX, r1, r2
11740 // bCC copy1MBB
11741 // fallthrough --> copy0MBB
11742 MachineBasicBlock *thisMBB = BB;
11743 MachineFunction *F = BB->getParent();
11744 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
11745 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
11746 F->insert(It, copy0MBB);
11747 F->insert(It, sinkMBB);
11749 // Check whether CPSR is live past the tMOVCCr_pseudo.
11750 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
11751 if (!MI.killsRegister(ARM::CPSR) &&
11752 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
11753 copy0MBB->addLiveIn(ARM::CPSR);
11754 sinkMBB->addLiveIn(ARM::CPSR);
11757 // Transfer the remainder of BB and its successor edges to sinkMBB.
11758 sinkMBB->splice(sinkMBB->begin(), BB,
11759 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11760 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
11762 BB->addSuccessor(copy0MBB);
11763 BB->addSuccessor(sinkMBB);
11765 BuildMI(BB, dl, TII->get(ARM::tBcc))
11766 .addMBB(sinkMBB)
11767 .addImm(MI.getOperand(3).getImm())
11768 .addReg(MI.getOperand(4).getReg());
11770 // copy0MBB:
11771 // %FalseValue = ...
11772 // # fallthrough to sinkMBB
11773 BB = copy0MBB;
11775 // Update machine-CFG edges
11776 BB->addSuccessor(sinkMBB);
11778 // sinkMBB:
11779 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
11780 // ...
11781 BB = sinkMBB;
11782 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
11783 .addReg(MI.getOperand(1).getReg())
11784 .addMBB(copy0MBB)
11785 .addReg(MI.getOperand(2).getReg())
11786 .addMBB(thisMBB);
11788 MI.eraseFromParent(); // The pseudo instruction is gone now.
11789 return BB;
11792 case ARM::BCCi64:
11793 case ARM::BCCZi64: {
11794 // If there is an unconditional branch to the other successor, remove it.
11795 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
11797 // Compare both parts that make up the double comparison separately for
11798 // equality.
11799 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
11801 Register LHS1 = MI.getOperand(1).getReg();
11802 Register LHS2 = MI.getOperand(2).getReg();
11803 if (RHSisZero) {
11804 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11805 .addReg(LHS1)
11806 .addImm(0)
11807 .add(predOps(ARMCC::AL));
11808 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11809 .addReg(LHS2).addImm(0)
11810 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
11811 } else {
11812 Register RHS1 = MI.getOperand(3).getReg();
11813 Register RHS2 = MI.getOperand(4).getReg();
11814 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
11815 .addReg(LHS1)
11816 .addReg(RHS1)
11817 .add(predOps(ARMCC::AL));
11818 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
11819 .addReg(LHS2).addReg(RHS2)
11820 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
11823 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
11824 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
11825 if (MI.getOperand(0).getImm() == ARMCC::NE)
11826 std::swap(destMBB, exitMBB);
11828 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
11829 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
11830 if (isThumb2)
11831 BuildMI(BB, dl, TII->get(ARM::t2B))
11832 .addMBB(exitMBB)
11833 .add(predOps(ARMCC::AL));
11834 else
11835 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
11837 MI.eraseFromParent(); // The pseudo instruction is gone now.
11838 return BB;
11841 case ARM::Int_eh_sjlj_setjmp:
11842 case ARM::Int_eh_sjlj_setjmp_nofp:
11843 case ARM::tInt_eh_sjlj_setjmp:
11844 case ARM::t2Int_eh_sjlj_setjmp:
11845 case ARM::t2Int_eh_sjlj_setjmp_nofp:
11846 return BB;
11848 case ARM::Int_eh_sjlj_setup_dispatch:
11849 EmitSjLjDispatchBlock(MI, BB);
11850 return BB;
11852 case ARM::ABS:
11853 case ARM::t2ABS: {
11854 // To insert an ABS instruction, we have to insert the
11855 // diamond control-flow pattern. The incoming instruction knows the
11856 // source vreg to test against 0, the destination vreg to set,
11857 // the condition code register to branch on, the
11858 // true/false values to select between, and a branch opcode to use.
11859 // It transforms
11860 // V1 = ABS V0
11861 // into
11862 // V2 = MOVS V0
11863 // BCC (branch to SinkBB if V0 >= 0)
11864 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
11865 // SinkBB: V1 = PHI(V2, V3)
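// For example (illustrative registers), on ARM this expands to:
//   cmp  r0, #0
//   bpl  SinkBB              ; skip the negation when r0 >= 0
// RSBBB:
//   rsb  r1, r0, #0          ; r1 = -r0
// SinkBB:
//   result = PHI(r0, r1)     ; later if-converted to a single rsbmi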
11866 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11867 MachineFunction::iterator BBI = ++BB->getIterator();
11868 MachineFunction *Fn = BB->getParent();
11869 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
11870 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
11871 Fn->insert(BBI, RSBBB);
11872 Fn->insert(BBI, SinkBB);
11874 Register ABSSrcReg = MI.getOperand(1).getReg();
11875 Register ABSDstReg = MI.getOperand(0).getReg();
11876 bool ABSSrcKill = MI.getOperand(1).isKill();
11877 bool isThumb2 = Subtarget->isThumb2();
11878 MachineRegisterInfo &MRI = Fn->getRegInfo();
11879 // In Thumb mode, S must not be specified if the source register is the SP or
11880 // PC and the destination register is the SP, so restrict the register class
11881 Register NewRsbDstReg = MRI.createVirtualRegister(
11882 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
11884 // Transfer the remainder of BB and its successor edges to sinkMBB.
11885 SinkBB->splice(SinkBB->begin(), BB,
11886 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11887 SinkBB->transferSuccessorsAndUpdatePHIs(BB);
11889 BB->addSuccessor(RSBBB);
11890 BB->addSuccessor(SinkBB);
11892 // fall through to SinkMBB
11893 RSBBB->addSuccessor(SinkBB);
11895 // insert a cmp at the end of BB
11896 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11897 .addReg(ABSSrcReg)
11898 .addImm(0)
11899 .add(predOps(ARMCC::AL));
11901 // insert a bcc with opposite CC to ARMCC::MI at the end of BB
11902 BuildMI(BB, dl,
11903 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
11904 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
11906 // insert rsbri in RSBBB
11907 // Note: BCC and rsbri will be converted into a predicated rsbmi
11908 // by the if-conversion pass
11909 BuildMI(*RSBBB, RSBBB->begin(), dl,
11910 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
11911 .addReg(ABSSrcReg, ABSSrcKill ? RegState::Kill : 0)
11912 .addImm(0)
11913 .add(predOps(ARMCC::AL))
11914 .add(condCodeOp());
11916 // insert PHI in SinkBB,
11917 // reuse ABSDstReg to not change uses of ABS instruction
11918 BuildMI(*SinkBB, SinkBB->begin(), dl,
11919 TII->get(ARM::PHI), ABSDstReg)
11920 .addReg(NewRsbDstReg).addMBB(RSBBB)
11921 .addReg(ABSSrcReg).addMBB(BB);
11923 // remove ABS instruction
11924 MI.eraseFromParent();
11926 // return last added BB
11927 return SinkBB;
11929 case ARM::COPY_STRUCT_BYVAL_I32:
11930 ++NumLoopByVals;
11931 return EmitStructByval(MI, BB);
11932 case ARM::WIN__CHKSTK:
11933 return EmitLowered__chkstk(MI, BB);
11934 case ARM::WIN__DBZCHK:
11935 return EmitLowered__dbzchk(MI, BB);
11936 }
11937 }
11939 /// Attaches vregs to MEMCPY that it will use as scratch registers
11940 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
11941 /// instead of as a custom inserter because we need the use list from the SDNode.
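/// For example, if the MEMCPY's scratch-count operand (operand 4) is 2, two
/// fresh virtual registers (tGPR on Thumb1, GPR otherwise) are appended to the
/// instruction as dead defs.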
11942 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
11943 MachineInstr &MI, const SDNode *Node) {
11944 bool isThumb1 = Subtarget->isThumb1Only();
11946 DebugLoc DL = MI.getDebugLoc();
11947 MachineFunction *MF = MI.getParent()->getParent();
11948 MachineRegisterInfo &MRI = MF->getRegInfo();
11949 MachineInstrBuilder MIB(*MF, MI);
11951 // If the new dst/src is unused, mark it as dead.
11952 if (!Node->hasAnyUseOfValue(0)) {
11953 MI.getOperand(0).setIsDead(true);
11955 if (!Node->hasAnyUseOfValue(1)) {
11956 MI.getOperand(1).setIsDead(true);
11959 // The MEMCPY both defines and kills the scratch registers.
11960 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
11961 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
11962 : &ARM::GPRRegClass);
11963 MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
11964 }
11965 }
11967 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
11968 SDNode *Node) const {
11969 if (MI.getOpcode() == ARM::MEMCPY) {
11970 attachMEMCPYScratchRegs(Subtarget, MI, Node);
11971 return;
11974 const MCInstrDesc *MCID = &MI.getDesc();
11975 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
11976 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
11977 // operand is still set to noreg. If needed, set the optional operand's
11978 // register to CPSR, and remove the redundant implicit def.
11980 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
11982 // Rename pseudo opcodes.
11983 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
11984 unsigned ccOutIdx;
11985 if (NewOpc) {
11986 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
11987 MCID = &TII->get(NewOpc);
11989 assert(MCID->getNumOperands() ==
11990 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
11991 && "converted opcode should be the same except for cc_out"
11992 " (and, on Thumb1, pred)");
11994 MI.setDesc(*MCID);
11996 // Add the optional cc_out operand
11997 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
11999 // On Thumb1, move all input operands to the end, then add the predicate
12000 if (Subtarget->isThumb1Only()) {
12001 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12002 MI.addOperand(MI.getOperand(1));
12003 MI.RemoveOperand(1);
12006 // Restore the ties
12007 for (unsigned i = MI.getNumOperands(); i--;) {
12008 const MachineOperand& op = MI.getOperand(i);
12009 if (op.isReg() && op.isUse()) {
12010 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12011 if (DefIdx != -1)
12012 MI.tieOperands(DefIdx, i);
12016 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
12017 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12018 ccOutIdx = 1;
12019 } else
12020 ccOutIdx = MCID->getNumOperands() - 1;
12021 } else
12022 ccOutIdx = MCID->getNumOperands() - 1;
12024 // Any ARM instruction that sets the 's' bit should specify an optional
12025 // "cc_out" operand in the last operand position.
12026 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
12027 assert(!NewOpc && "Optional cc_out operand required");
12028 return;
12030 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12031 // since we already have an optional CPSR def.
12032 bool definesCPSR = false;
12033 bool deadCPSR = false;
12034 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12035 ++i) {
12036 const MachineOperand &MO = MI.getOperand(i);
12037 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12038 definesCPSR = true;
12039 if (MO.isDead())
12040 deadCPSR = true;
12041 MI.RemoveOperand(i);
12042 break;
12045 if (!definesCPSR) {
12046 assert(!NewOpc && "Optional cc_out operand required");
12047 return;
12049 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12050 if (deadCPSR) {
12051 assert(!MI.getOperand(ccOutIdx).getReg() &&
12052 "expect uninitialized optional cc_out operand");
12053 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12054 if (!Subtarget->isThumb1Only())
12055 return;
12058 // If this instruction was defined with an optional CPSR def and its dag node
12059 // had a live implicit CPSR def, then activate the optional CPSR def.
12060 MachineOperand &MO = MI.getOperand(ccOutIdx);
12061 MO.setReg(ARM::CPSR);
12062 MO.setIsDef(true);
12063 }
12065 //===----------------------------------------------------------------------===//
12066 // ARM Optimization Hooks
12067 //===----------------------------------------------------------------------===//
12069 // Helper function that checks if N is a null or all ones constant.
12070 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12071 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
12072 }
12074 // Return true if N is conditionally 0 or all ones.
12075 // Detects these expressions where cc is an i1 value:
12077 // (select cc 0, y) [AllOnes=0]
12078 // (select cc y, 0) [AllOnes=0]
12079 // (zext cc) [AllOnes=0]
12080 // (sext cc) [AllOnes=0/1]
12081 // (select cc -1, y) [AllOnes=1]
12082 // (select cc y, -1) [AllOnes=1]
12084 // Invert is set when N is the null/all ones constant when CC is false.
12085 // OtherOp is set to the alternative value of N.
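//
// For example, with AllOnes=0, N = (select cc, 0, 42) returns true with
// CC = cc, Invert = false and OtherOp = 42, and N = (zext cc) returns true
// with Invert = true and OtherOp = 1.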
12086 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12087 SDValue &CC, bool &Invert,
12088 SDValue &OtherOp,
12089 SelectionDAG &DAG) {
12090 switch (N->getOpcode()) {
12091 default: return false;
12092 case ISD::SELECT: {
12093 CC = N->getOperand(0);
12094 SDValue N1 = N->getOperand(1);
12095 SDValue N2 = N->getOperand(2);
12096 if (isZeroOrAllOnes(N1, AllOnes)) {
12097 Invert = false;
12098 OtherOp = N2;
12099 return true;
12101 if (isZeroOrAllOnes(N2, AllOnes)) {
12102 Invert = true;
12103 OtherOp = N1;
12104 return true;
12106 return false;
12108 case ISD::ZERO_EXTEND:
12109 // (zext cc) can never be the all ones value.
12110 if (AllOnes)
12111 return false;
12112 LLVM_FALLTHROUGH;
12113 case ISD::SIGN_EXTEND: {
12114 SDLoc dl(N);
12115 EVT VT = N->getValueType(0);
12116 CC = N->getOperand(0);
12117 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12118 return false;
12119 Invert = !AllOnes;
12120 if (AllOnes)
12121 // When looking for an AllOnes constant, N is an sext, and the 'other'
12122 // value is 0.
12123 OtherOp = DAG.getConstant(0, dl, VT);
12124 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12125 // When looking for a 0 constant, N can be zext or sext.
12126 OtherOp = DAG.getConstant(1, dl, VT);
12127 else
12128 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
12129 VT);
12130 return true;
12131 }
12132 }
12133 }
12135 // Combine a constant select operand into its use:
12137 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12138 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12139 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12140 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12141 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12143 // The transform is rejected if the select doesn't have a constant operand that
12144 // is null, or all ones when AllOnes is set.
12146 // Also recognize sext/zext from i1:
12148 // (add (zext cc), x) -> (select cc (add x, 1), x)
12149 // (add (sext cc), x) -> (select cc (add x, -1), x)
12151 // These transformations eventually create predicated instructions.
12153 // @param N The node to transform.
12154 // @param Slct The N operand that is a select.
12155 // @param OtherOp The other N operand (x above).
12156 // @param DCI Context.
12157 // @param AllOnes Require the select constant to be all ones instead of null.
12158 // @returns The new node, or SDValue() on failure.
12159 static
12160 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
12161 TargetLowering::DAGCombinerInfo &DCI,
12162 bool AllOnes = false) {
12163 SelectionDAG &DAG = DCI.DAG;
12164 EVT VT = N->getValueType(0);
12165 SDValue NonConstantVal;
12166 SDValue CCOp;
12167 bool SwapSelectOps;
12168 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12169 NonConstantVal, DAG))
12170 return SDValue();
12172 // Slct is now known to be the desired identity constant when CC is true.
12173 SDValue TrueVal = OtherOp;
12174 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12175 OtherOp, NonConstantVal);
12176 // Unless SwapSelectOps says CC should be false.
12177 if (SwapSelectOps)
12178 std::swap(TrueVal, FalseVal);
12180 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12181 CCOp, TrueVal, FalseVal);
12182 }
12184 // Attempt combineSelectAndUse on each operand of a commutative operator N.
12185 static
12186 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12187 TargetLowering::DAGCombinerInfo &DCI) {
12188 SDValue N0 = N->getOperand(0);
12189 SDValue N1 = N->getOperand(1);
12190 if (N0.getNode()->hasOneUse())
12191 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12192 return Result;
12193 if (N1.getNode()->hasOneUse())
12194 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12195 return Result;
12196 return SDValue();
12197 }
12199 static bool IsVUZPShuffleNode(SDNode *N) {
12200 // VUZP shuffle node.
12201 if (N->getOpcode() == ARMISD::VUZP)
12202 return true;
12204 // "VUZP" on i32 is an alias for VTRN.
12205 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12206 return true;
12208 return false;
12209 }
12211 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
12212 TargetLowering::DAGCombinerInfo &DCI,
12213 const ARMSubtarget *Subtarget) {
12214 // Look for ADD(VUZP.0, VUZP.1).
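// e.g. with u = VUZP(a, b), (add u.0, u.1) is turned into (vpadd a, b) below.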
12215 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12216 N0 == N1)
12217 return SDValue();
12219 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12220 if (!N->getValueType(0).is64BitVector())
12221 return SDValue();
12223 // Generate vpadd.
12224 SelectionDAG &DAG = DCI.DAG;
12225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12226 SDLoc dl(N);
12227 SDNode *Unzip = N0.getNode();
12228 EVT VT = N->getValueType(0);
12230 SmallVector<SDValue, 8> Ops;
12231 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12232 TLI.getPointerTy(DAG.getDataLayout())));
12233 Ops.push_back(Unzip->getOperand(0));
12234 Ops.push_back(Unzip->getOperand(1));
12236 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12237 }
12239 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12240 TargetLowering::DAGCombinerInfo &DCI,
12241 const ARMSubtarget *Subtarget) {
12242 // Check for two extended operands.
12243 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12244 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12245 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12246 N1.getOpcode() == ISD::ZERO_EXTEND))
12247 return SDValue();
12249 SDValue N00 = N0.getOperand(0);
12250 SDValue N10 = N1.getOperand(0);
12252 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12253 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12254 N00 == N10)
12255 return SDValue();
12257 // We only recognize Q register paddl here; this can't be reached until
12258 // after type legalization.
12259 if (!N00.getValueType().is64BitVector() ||
12260 !N0.getValueType().is128BitVector())
12261 return SDValue();
12263 // Generate vpaddl.
12264 SelectionDAG &DAG = DCI.DAG;
12265 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12266 SDLoc dl(N);
12267 EVT VT = N->getValueType(0);
12269 SmallVector<SDValue, 8> Ops;
12270 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12271 unsigned Opcode;
12272 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12273 Opcode = Intrinsic::arm_neon_vpaddls;
12274 else
12275 Opcode = Intrinsic::arm_neon_vpaddlu;
12276 Ops.push_back(DAG.getConstant(Opcode, dl,
12277 TLI.getPointerTy(DAG.getDataLayout())));
12278 EVT ElemTy = N00.getValueType().getVectorElementType();
12279 unsigned NumElts = VT.getVectorNumElements();
12280 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12281 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12282 N00.getOperand(0), N00.getOperand(1));
12283 Ops.push_back(Concat);
12285 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12286 }
12288 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12289 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12290 // much easier to match.
12291 static SDValue
12292 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12293 TargetLowering::DAGCombinerInfo &DCI,
12294 const ARMSubtarget *Subtarget) {
12295 // Only perform this optimization after legalization and if NEON is available.
12296 // We also expect both operands to be BUILD_VECTORs.
12297 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12298 || N0.getOpcode() != ISD::BUILD_VECTOR
12299 || N1.getOpcode() != ISD::BUILD_VECTOR)
12300 return SDValue();
12302 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12303 EVT VT = N->getValueType(0);
12304 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12305 return SDValue();
12307 // Check that the vector operands are of the right form.
12308 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12309 // operands, where N is the size of the formed vector.
12310 // Each EXTRACT_VECTOR should have the same input vector and an odd or even
12311 // index such that we have a pairwise add pattern.
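// Schematically, for a v4i16 result built from one v8i8 vector V:
//   N0 = BUILD_VECTOR (extractelt V, 0), (extractelt V, 2), ...
//   N1 = BUILD_VECTOR (extractelt V, 1), (extractelt V, 3), ...
//   (add N0, N1)  ==>  a vpaddl of V.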
12313 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12314 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12315 return SDValue();
12316 SDValue Vec = N0->getOperand(0)->getOperand(0);
12317 SDNode *V = Vec.getNode();
12318 unsigned nextIndex = 0;
12320 // For each operand of the ADD (both are BUILD_VECTORs),
12321 // check to see if each of their operands is an EXTRACT_VECTOR with
12322 // the same vector and the appropriate index.
12323 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12324 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
12325 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12327 SDValue ExtVec0 = N0->getOperand(i);
12328 SDValue ExtVec1 = N1->getOperand(i);
12330 // The first operand is the vector; verify it's the same.
12331 if (V != ExtVec0->getOperand(0).getNode() ||
12332 V != ExtVec1->getOperand(0).getNode())
12333 return SDValue();
12335 // The second operand is the constant index; verify it's correct.
12336 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12337 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12339 // For the constant, we want to see all the even or all the odd.
12340 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12341 || C1->getZExtValue() != nextIndex+1)
12342 return SDValue();
12344 // Increment index.
12345 nextIndex+=2;
12346 } else
12347 return SDValue();
12350 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12351 // we're using the entire input vector, otherwise there's a size/legality
12352 // mismatch somewhere.
12353 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12354 Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
12355 return SDValue();
12357 // Create VPADDL node.
12358 SelectionDAG &DAG = DCI.DAG;
12359 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12361 SDLoc dl(N);
12363 // Build operand list.
12364 SmallVector<SDValue, 8> Ops;
12365 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12366 TLI.getPointerTy(DAG.getDataLayout())));
12368 // Input is the vector.
12369 Ops.push_back(Vec);
12371 // Get widened type and narrowed type.
12372 MVT widenType;
12373 unsigned numElem = VT.getVectorNumElements();
12375 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12376 switch (inputLaneType.getSimpleVT().SimpleTy) {
12377 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12378 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12379 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12380 default:
12381 llvm_unreachable("Invalid vector element type for padd optimization.");
12384 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12385 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12386 return DAG.getNode(ExtOp, dl, VT, tmp);
12387 }
12389 static SDValue findMUL_LOHI(SDValue V) {
12390 if (V->getOpcode() == ISD::UMUL_LOHI ||
12391 V->getOpcode() == ISD::SMUL_LOHI)
12392 return V;
12393 return SDValue();
12394 }
12396 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12397 TargetLowering::DAGCombinerInfo &DCI,
12398 const ARMSubtarget *Subtarget) {
12399 if (!Subtarget->hasBaseDSP())
12400 return SDValue();
12402 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12403 // accumulates the product into a 64-bit value. The 16-bit values will
12404 // be sign extended somehow or SRA'd into 32-bit values
12405 // (addc (adde (mul 16bit, 16bit), lo), hi)
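// Concretely (schematic DAG), with a and b sign-extended from 16 bits:
//   {lo', carry} = ADDC (mul a, b), lo
//   hi'          = ADDE (sra (mul a, b), 31), hi, carry
// is replaced below by a single SMLALBB a, b, lo, hi (or the BT/TB/TT forms
// when an operand is a top-half extract).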
12406 SDValue Mul = AddcNode->getOperand(0);
12407 SDValue Lo = AddcNode->getOperand(1);
12408 if (Mul.getOpcode() != ISD::MUL) {
12409 Lo = AddcNode->getOperand(0);
12410 Mul = AddcNode->getOperand(1);
12411 if (Mul.getOpcode() != ISD::MUL)
12412 return SDValue();
12415 SDValue SRA = AddeNode->getOperand(0);
12416 SDValue Hi = AddeNode->getOperand(1);
12417 if (SRA.getOpcode() != ISD::SRA) {
12418 SRA = AddeNode->getOperand(1);
12419 Hi = AddeNode->getOperand(0);
12420 if (SRA.getOpcode() != ISD::SRA)
12421 return SDValue();
12423 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12424 if (Const->getZExtValue() != 31)
12425 return SDValue();
12426 } else
12427 return SDValue();
12429 if (SRA.getOperand(0) != Mul)
12430 return SDValue();
12432 SelectionDAG &DAG = DCI.DAG;
12433 SDLoc dl(AddcNode);
12434 unsigned Opcode = 0;
12435 SDValue Op0;
12436 SDValue Op1;
12438 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12439 Opcode = ARMISD::SMLALBB;
12440 Op0 = Mul.getOperand(0);
12441 Op1 = Mul.getOperand(1);
12442 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12443 Opcode = ARMISD::SMLALBT;
12444 Op0 = Mul.getOperand(0);
12445 Op1 = Mul.getOperand(1).getOperand(0);
12446 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12447 Opcode = ARMISD::SMLALTB;
12448 Op0 = Mul.getOperand(0).getOperand(0);
12449 Op1 = Mul.getOperand(1);
12450 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12451 Opcode = ARMISD::SMLALTT;
12452 Op0 = Mul->getOperand(0).getOperand(0);
12453 Op1 = Mul->getOperand(1).getOperand(0);
12456 if (!Op0 || !Op1)
12457 return SDValue();
12459 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12460 Op0, Op1, Lo, Hi);
12461 // Replace the ADD nodes' uses with the SMLAL node's values.
12462 SDValue HiMLALResult(SMLAL.getNode(), 1);
12463 SDValue LoMLALResult(SMLAL.getNode(), 0);
12465 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12466 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12468 // Return original node to notify the driver to stop replacing.
12469 SDValue resNode(AddcNode, 0);
12470 return resNode;
12471 }
12473 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12474 TargetLowering::DAGCombinerInfo &DCI,
12475 const ARMSubtarget *Subtarget) {
12476 // Look for multiply add opportunities.
12477 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
12478 // each add node consumes a value from ISD::UMUL_LOHI and there is
12479 // a glue link from the first add to the second add.
12480 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12481 // a S/UMLAL instruction.
12482 // UMUL_LOHI
12483 // / :lo \ :hi
12484 // V \ [no multiline comment]
12485 // loAdd -> ADDC |
12486 // \ :carry /
12487 // V V
12488 // ADDE <- hiAdd
12490 // In the special case where only the higher part of a signed result is used
12491 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12492 // a constant with the exact value of 0x80000000, we recognize we are dealing
12493 // with a "rounded multiply and add" (or subtract) and transform it into
12494 // either an ARMISD::SMMLAR or an ARMISD::SMMLSR, respectively.
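//
// For example (schematic), with t = SMUL_LOHI a, b:
//   {lo, carry} = ADDC t.lo, X
//   hi          = ADDE t.hi, Y, carry
// becomes {lo, hi} = SMLAL a, b, X, Y (UMLAL for UMUL_LOHI).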
12496 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12497 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12498 "Expect an ADDE or SUBE");
12500 assert(AddeSubeNode->getNumOperands() == 3 &&
12501 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12502 "ADDE node has the wrong inputs");
12504 // Check that we are chained to the right ADDC or SUBC node.
12505 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12506 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12507 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12508 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12509 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12510 return SDValue();
12512 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12513 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12515 // Check if the two operands are from the same mul_lohi node.
12516 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12517 return SDValue();
12519 assert(AddcSubcNode->getNumValues() == 2 &&
12520 AddcSubcNode->getValueType(0) == MVT::i32 &&
12521 "Expect ADDC with two result values. First: i32");
12523 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12524 // may be an SMLAL which multiplies two 16-bit values.
12525 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12526 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12527 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12528 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12529 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12530 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12532 // Check for the triangle shape.
12533 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12534 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12536 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12537 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12538 return SDValue();
12540 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12541 bool IsLeftOperandMUL = false;
12542 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12543 if (MULOp == SDValue())
12544 MULOp = findMUL_LOHI(AddeSubeOp1);
12545 else
12546 IsLeftOperandMUL = true;
12547 if (MULOp == SDValue())
12548 return SDValue();
12550 // Figure out the right opcode.
12551 unsigned Opc = MULOp->getOpcode();
12552 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12554 // Figure out the high and low input values to the MLAL node.
12555 SDValue *HiAddSub = nullptr;
12556 SDValue *LoMul = nullptr;
12557 SDValue *LowAddSub = nullptr;
12559 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12560 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12561 return SDValue();
12563 if (IsLeftOperandMUL)
12564 HiAddSub = &AddeSubeOp1;
12565 else
12566 HiAddSub = &AddeSubeOp0;
12568 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12569 // whose low result is fed to the ADDC/SUBC we are checking.
12571 if (AddcSubcOp0 == MULOp.getValue(0)) {
12572 LoMul = &AddcSubcOp0;
12573 LowAddSub = &AddcSubcOp1;
12575 if (AddcSubcOp1 == MULOp.getValue(0)) {
12576 LoMul = &AddcSubcOp1;
12577 LowAddSub = &AddcSubcOp0;
12580 if (!LoMul)
12581 return SDValue();
12583 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12584 // the replacement below will create a cycle.
12585 if (AddcSubcNode == HiAddSub->getNode() ||
12586 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12587 return SDValue();
12589 // Create the merged node.
12590 SelectionDAG &DAG = DCI.DAG;
12592 // Start building operand list.
12593 SmallVector<SDValue, 8> Ops;
12594 Ops.push_back(LoMul->getOperand(0));
12595 Ops.push_back(LoMul->getOperand(1));
12597 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12598 // the case, we must be doing a signed multiplication and only use the higher
12599 // part of the result of the MLAL; furthermore, the LowAddSub must be a
12600 // constant addition or subtraction with the value of 0x80000000.
12601 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12602 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12603 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12604 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12605 0x80000000) {
12606 Ops.push_back(*HiAddSub);
12607 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12608 FinalOpc = ARMISD::SMMLSR;
12609 } else {
12610 FinalOpc = ARMISD::SMMLAR;
12612 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12613 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12615 return SDValue(AddeSubeNode, 0);
12616 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12617 // SMMLS is generated during instruction selection and the rest of this
12618 // function can not handle the case where AddcSubcNode is a SUBC.
12619 return SDValue();
12621 // Finish building the operand list for {U/S}MLAL
12622 Ops.push_back(*LowAddSub);
12623 Ops.push_back(*HiAddSub);
12625 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12626 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12628 // Replace the ADD nodes' uses with the MLAL node's values.
12629 SDValue HiMLALResult(MLALNode.getNode(), 1);
12630 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12632 SDValue LoMLALResult(MLALNode.getNode(), 0);
12633 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12635 // Return original node to notify the driver to stop replacing.
12636 return SDValue(AddeSubeNode, 0);
12637 }
12639 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
12640 TargetLowering::DAGCombinerInfo &DCI,
12641 const ARMSubtarget *Subtarget) {
12642 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12643 // While trying to combine for the other MLAL nodes, first search for the
12644 // chance to use UMAAL. Check if Addc uses a node which has already
12645 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12646 // as the addend, and it's handled in PerformUMLALCombine.
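//
// Schematically, with u = UMLAL a, b, Lo, 0:
//   {lo, carry} = ADDC u.lo, Hi
//   hi          = ADDE u.hi, 0, carry
// is folded below into {lo, hi} = UMAAL a, b, Lo, Hi.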
12648 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12649 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12651 // Check that we have a glued ADDC node.
12652 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12653 if (AddcNode->getOpcode() != ARMISD::ADDC)
12654 return SDValue();
12656 // Find the converted UMAAL or quit if it doesn't exist.
12657 SDNode *UmlalNode = nullptr;
12658 SDValue AddHi;
12659 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12660 UmlalNode = AddcNode->getOperand(0).getNode();
12661 AddHi = AddcNode->getOperand(1);
12662 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12663 UmlalNode = AddcNode->getOperand(1).getNode();
12664 AddHi = AddcNode->getOperand(0);
12665 } else {
12666 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12669 // The UMLAL's hi-addend (operand 3) must be zero, and the ADDE must add zero
12670 // to the same UMLAL node that feeds the ADDC.
12671 if (!isNullConstant(UmlalNode->getOperand(3)))
12672 return SDValue();
12674 if ((isNullConstant(AddeNode->getOperand(0)) &&
12675 AddeNode->getOperand(1).getNode() == UmlalNode) ||
12676 (AddeNode->getOperand(0).getNode() == UmlalNode &&
12677 isNullConstant(AddeNode->getOperand(1)))) {
12678 SelectionDAG &DAG = DCI.DAG;
12679 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12680 UmlalNode->getOperand(2), AddHi };
12681 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
12682 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12684 // Replace the ADD nodes' uses with the UMAAL node's values.
12685 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12686 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12688 // Return original node to notify the driver to stop replacing.
12689 return SDValue(AddeNode, 0);
12690 }
12691 return SDValue();
12692 }
12694 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
12695 const ARMSubtarget *Subtarget) {
12696 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12697 return SDValue();
12699 // Check that we have a pair of ADDC and ADDE as operands.
12700 // Both addends of the ADDE must be zero.
12701 SDNode* AddcNode = N->getOperand(2).getNode();
12702 SDNode* AddeNode = N->getOperand(3).getNode();
12703 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
12704 (AddeNode->getOpcode() == ARMISD::ADDE) &&
12705 isNullConstant(AddeNode->getOperand(0)) &&
12706 isNullConstant(AddeNode->getOperand(1)) &&
12707 (AddeNode->getOperand(2).getNode() == AddcNode))
12708 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
12709 DAG.getVTList(MVT::i32, MVT::i32),
12710 {N->getOperand(0), N->getOperand(1),
12711 AddcNode->getOperand(0), AddcNode->getOperand(1)});
12712 else
12713 return SDValue();
12714 }
12716 static SDValue PerformAddcSubcCombine(SDNode *N,
12717 TargetLowering::DAGCombinerInfo &DCI,
12718 const ARMSubtarget *Subtarget) {
12719 SelectionDAG &DAG(DCI.DAG);
12721 if (N->getOpcode() == ARMISD::SUBC) {
12722 // (SUBC (ADDE 0, 0, C), 1) -> C
12723 SDValue LHS = N->getOperand(0);
12724 SDValue RHS = N->getOperand(1);
12725 if (LHS->getOpcode() == ARMISD::ADDE &&
12726 isNullConstant(LHS->getOperand(0)) &&
12727 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
12728 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
12732 if (Subtarget->isThumb1Only()) {
12733 SDValue RHS = N->getOperand(1);
12734 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12735 int32_t imm = C->getSExtValue();
12736 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
12737 SDLoc DL(N);
12738 RHS = DAG.getConstant(-imm, DL, MVT::i32);
12739 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
12740 : ARMISD::ADDC;
12741 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
12746 return SDValue();
12747 }
12749 static SDValue PerformAddeSubeCombine(SDNode *N,
12750 TargetLowering::DAGCombinerInfo &DCI,
12751 const ARMSubtarget *Subtarget) {
12752 if (Subtarget->isThumb1Only()) {
12753 SelectionDAG &DAG = DCI.DAG;
12754 SDValue RHS = N->getOperand(1);
12755 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12756 int64_t imm = C->getSExtValue();
12757 if (imm < 0) {
12758 SDLoc DL(N);
12760 // The with-carry-in form matches bitwise not instead of the negation.
12761 // Effectively, the inverse interpretation of the carry flag already
12762 // accounts for part of the negation.
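// For example, (ADDE x, -1, carry) becomes (SUBE x, 0, carry), since ~(-1) == 0.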
12763 RHS = DAG.getConstant(~imm, DL, MVT::i32);
12765 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
12766 : ARMISD::ADDE;
12767 return DAG.getNode(Opcode, DL, N->getVTList(),
12768 N->getOperand(0), RHS, N->getOperand(2));
12771 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
12772 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
12774 return SDValue();
12775 }
12777 static SDValue PerformSELECTCombine(SDNode *N,
12778 TargetLowering::DAGCombinerInfo &DCI,
12779 const ARMSubtarget *Subtarget) {
12780 if (!Subtarget->hasMVEIntegerOps())
12781 return SDValue();
12783 SDLoc dl(N);
12784 SDValue SetCC;
12785 SDValue LHS;
12786 SDValue RHS;
12787 ISD::CondCode CC;
12788 SDValue TrueVal;
12789 SDValue FalseVal;
12791 if (N->getOpcode() == ISD::SELECT &&
12792 N->getOperand(0)->getOpcode() == ISD::SETCC) {
12793 SetCC = N->getOperand(0);
12794 LHS = SetCC->getOperand(0);
12795 RHS = SetCC->getOperand(1);
12796 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
12797 TrueVal = N->getOperand(1);
12798 FalseVal = N->getOperand(2);
12799 } else if (N->getOpcode() == ISD::SELECT_CC) {
12800 LHS = N->getOperand(0);
12801 RHS = N->getOperand(1);
12802 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
12803 TrueVal = N->getOperand(2);
12804 FalseVal = N->getOperand(3);
12805 } else {
12806 return SDValue();
12809 unsigned int Opcode = 0;
12810 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
12811 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
12812 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
12813 Opcode = ARMISD::VMINVu;
12814 if (CC == ISD::SETUGT)
12815 std::swap(TrueVal, FalseVal);
12816 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
12817 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
12818 (CC == ISD::SETLT || CC == ISD::SETGT)) {
12819 Opcode = ARMISD::VMINVs;
12820 if (CC == ISD::SETGT)
12821 std::swap(TrueVal, FalseVal);
12822 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
12823 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
12824 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
12825 Opcode = ARMISD::VMAXVu;
12826 if (CC == ISD::SETULT)
12827 std::swap(TrueVal, FalseVal);
12828 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
12829 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
12830 (CC == ISD::SETGT || CC == ISD::SETLT)) {
12831 Opcode = ARMISD::VMAXVs;
12832 if (CC == ISD::SETLT)
12833 std::swap(TrueVal, FalseVal);
12834 } else
12835 return SDValue();
12837 // Normalise to the right hand side being the vector reduction
12838 switch (TrueVal->getOpcode()) {
12839 case ISD::VECREDUCE_UMIN:
12840 case ISD::VECREDUCE_SMIN:
12841 case ISD::VECREDUCE_UMAX:
12842 case ISD::VECREDUCE_SMAX:
12843 std::swap(LHS, RHS);
12844 std::swap(TrueVal, FalseVal);
12845 break;
12848 EVT VectorType = FalseVal->getOperand(0).getValueType();
12850 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
12851 VectorType != MVT::v4i32)
12852 return SDValue();
12854 EVT VectorScalarType = VectorType.getVectorElementType();
12856 // The values being selected must also be the ones being compared
12857 if (TrueVal != LHS || FalseVal != RHS)
12858 return SDValue();
12860 EVT LeftType = LHS->getValueType(0);
12861 EVT RightType = RHS->getValueType(0);
12863 // The types must match the reduced type too
12864 if (LeftType != VectorScalarType || RightType != VectorScalarType)
12865 return SDValue();
12867 // Legalise the scalar to an i32
12868 if (VectorScalarType != MVT::i32)
12869 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
12871 // Generate the reduction as an i32 for legalisation purposes
12872 auto Reduction =
12873 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
12875 // The result isn't actually an i32 so truncate it back to its original type
12876 if (VectorScalarType != MVT::i32)
12877 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
12879 return Reduction;
12880 }
12882 // A special combine for the vqdmulh family of instructions. This is one of the
12883 // potential set of patterns that could match this instruction. The base pattern
12884 // you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
12885 // This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
12886 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
12887 // the max is unnecessary.
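//
// For example (schematic, i16 lanes): t = mul (sext x), (sext y);
// s = sra t, 15; r = smin s, splat(32767) is matched below and becomes
// VQDMULH(x, y), sign-extended back to the wider result type where needed.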
12888 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
12889 EVT VT = N->getValueType(0);
12890 SDValue Shft;
12891 ConstantSDNode *Clamp;
12893 if (!VT.isVector())
12894 return SDValue();
12896 if (N->getOpcode() == ISD::SMIN) {
12897 Shft = N->getOperand(0);
12898 Clamp = isConstOrConstSplat(N->getOperand(1));
12899 } else if (N->getOpcode() == ISD::VSELECT) {
12900 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
12901 SDValue Cmp = N->getOperand(0);
12902 if (Cmp.getOpcode() != ISD::SETCC ||
12903 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
12904 Cmp.getOperand(0) != N->getOperand(1) ||
12905 Cmp.getOperand(1) != N->getOperand(2))
12906 return SDValue();
12907 Shft = N->getOperand(1);
12908 Clamp = isConstOrConstSplat(N->getOperand(2));
12909 } else
12910 return SDValue();
12912 if (!Clamp)
12913 return SDValue();
12915 MVT ScalarType;
12916 int ShftAmt = 0;
12917 switch (Clamp->getSExtValue()) {
12918 case (1 << 7) - 1:
12919 ScalarType = MVT::i8;
12920 ShftAmt = 7;
12921 break;
12922 case (1 << 15) - 1:
12923 ScalarType = MVT::i16;
12924 ShftAmt = 15;
12925 break;
12926 case (1ULL << 31) - 1:
12927 ScalarType = MVT::i32;
12928 ShftAmt = 31;
12929 break;
12930 default:
12931 return SDValue();
12934 if (Shft.getOpcode() != ISD::SRA)
12935 return SDValue();
12936 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
12937 if (!N1 || N1->getSExtValue() != ShftAmt)
12938 return SDValue();
12940 SDValue Mul = Shft.getOperand(0);
12941 if (Mul.getOpcode() != ISD::MUL)
12942 return SDValue();
12944 SDValue Ext0 = Mul.getOperand(0);
12945 SDValue Ext1 = Mul.getOperand(1);
12946 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
12947 Ext1.getOpcode() != ISD::SIGN_EXTEND)
12948 return SDValue();
12949 EVT VecVT = Ext0.getOperand(0).getValueType();
12950 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
12951 return SDValue();
12952 if (Ext1.getOperand(0).getValueType() != VecVT ||
12953 VecVT.getScalarType() != ScalarType ||
12954 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
12955 return SDValue();
12957 SDLoc DL(Mul);
12958 unsigned LegalLanes = 128 / (ShftAmt + 1);
12959 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
12960 // For types smaller than legal vectors, extend to be legal and only use the
12961 // needed lanes.
12962 if (VecVT.getSizeInBits() < 128) {
12963 EVT ExtVecVT =
12964 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
12965 VecVT.getVectorNumElements());
12966 SDValue Inp0 =
12967 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
12968 SDValue Inp1 =
12969 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
12970 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
12971 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
12972 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
12973 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
12974 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
12975 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
12978 // For larger types, split into legal sized chunks.
12979 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
12980 unsigned NumParts = VecVT.getSizeInBits() / 128;
12981 SmallVector<SDValue> Parts;
12982 for (unsigned I = 0; I < NumParts; ++I) {
12983 SDValue Inp0 =
12984 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
12985 DAG.getVectorIdxConstant(I * LegalLanes, DL));
12986 SDValue Inp1 =
12987 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
12988 DAG.getVectorIdxConstant(I * LegalLanes, DL));
12989 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
12990 Parts.push_back(VQDMULH);
12992 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
12993 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
12994 }
12996 static SDValue PerformVSELECTCombine(SDNode *N,
12997 TargetLowering::DAGCombinerInfo &DCI,
12998 const ARMSubtarget *Subtarget) {
12999 if (!Subtarget->hasMVEIntegerOps())
13000 return SDValue();
13002 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13003 return V;
13005 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13007 // We need to re-implement this optimization here as the implementation in the
13008 // Target-Independent DAGCombiner does not handle the kind of constant we make
13009 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13010 // good reason, allowing truncation there would break other targets).
13012 // Currently, this is only done for MVE, as it's the only target that benefits
13013 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
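// For example: (vselect (xor cond, splat(1)), a, b) -> (vselect cond, b, a).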
13014 if (N->getOperand(0).getOpcode() != ISD::XOR)
13015 return SDValue();
13016 SDValue XOR = N->getOperand(0);
13018 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13019 // It is important to check with truncation allowed as the BUILD_VECTORs we
13020 // generate in those situations will truncate their operands.
13021 ConstantSDNode *Const =
13022 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13023 /*AllowTruncation*/ true);
13024 if (!Const || !Const->isOne())
13025 return SDValue();
13027 // Rewrite into vselect(cond, rhs, lhs).
13028 SDValue Cond = XOR->getOperand(0);
13029 SDValue LHS = N->getOperand(1);
13030 SDValue RHS = N->getOperand(2);
13031 EVT Type = N->getValueType(0);
13032 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13033 }
13035 static SDValue PerformABSCombine(SDNode *N,
13036 TargetLowering::DAGCombinerInfo &DCI,
13037 const ARMSubtarget *Subtarget) {
13038 SDValue res;
13039 SelectionDAG &DAG = DCI.DAG;
13040 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13042 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
13043 return SDValue();
13045 if (!TLI.expandABS(N, res, DAG))
13046 return SDValue();
13048 return res;
13049 }
13051 /// PerformADDECombine - Target-specific dag combine transform from
13052 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13053 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13054 static SDValue PerformADDECombine(SDNode *N,
13055 TargetLowering::DAGCombinerInfo &DCI,
13056 const ARMSubtarget *Subtarget) {
13057 // Only ARM and Thumb2 support UMLAL/SMLAL.
13058 if (Subtarget->isThumb1Only())
13059 return PerformAddeSubeCombine(N, DCI, Subtarget);
13061 // Only perform the checks after legalize when the pattern is available.
13062 if (DCI.isBeforeLegalize()) return SDValue();
13064 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13065 }
13067 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13068 /// operands N0 and N1. This is a helper for PerformADDCombine that is
13069 /// called with the default operands, and if that fails, with commuted
13070 /// operands.
13071 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13072 TargetLowering::DAGCombinerInfo &DCI,
13073 const ARMSubtarget *Subtarget){
13074 // Attempt to create vpadd for this add.
13075 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13076 return Result;
13078 // Attempt to create vpaddl for this add.
13079 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13080 return Result;
13081 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13082 Subtarget))
13083 return Result;
13085 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13086 if (N0.getNode()->hasOneUse())
13087 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13088 return Result;
13089 return SDValue();
13090 }
13092 static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13093 EVT VT = N->getValueType(0);
13094 SDValue N0 = N->getOperand(0);
13095 SDValue N1 = N->getOperand(1);
13096 SDLoc dl(N);
13098 auto IsVecReduce = [](SDValue Op) {
13099 switch (Op.getOpcode()) {
13100 case ISD::VECREDUCE_ADD:
13101 case ARMISD::VADDVs:
13102 case ARMISD::VADDVu:
13103 case ARMISD::VMLAVs:
13104 case ARMISD::VMLAVu:
13105 return true;
13107 return false;
13110 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13111 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13112 // add(add(X, vecreduce(Y)), vecreduce(Z))
13113 // to make better use of vaddva style instructions.
13114 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13115 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13116 !isa<ConstantSDNode>(N0)) {
13117 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13118 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13120 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13121 // add(add(add(A, C), reduce(B)), reduce(D))
13122 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13123 N1.getOpcode() == ISD::ADD) {
13124 unsigned N0RedOp = 0;
13125 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13126 N0RedOp = 1;
13127 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13128 return SDValue();
13131 unsigned N1RedOp = 0;
13132 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13133 N1RedOp = 1;
13134 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13135 return SDValue();
13137 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13138 N1.getOperand(1 - N1RedOp));
13139 SDValue Add1 =
13140 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13141 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13143 return SDValue();
13145 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13146 return R;
13147 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13148 return R;
13150 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13151 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13152 // by ascending load offsets. This can help cores prefetch if the order of
13153 // loads is more predictable.
13154 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13155 // Check if two reductions are known to load data where one is before/after
13156 // another. Return negative if N0 loads data before N1, positive if N1 is
13157 // before N0, and 0 if nothing is known.
13158 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13159 // Look through to the first operand of a MUL, for the VMLA case.
13160 // Currently only looks at the first operand, in the hope they are equal.
13161 if (N0.getOpcode() == ISD::MUL)
13162 N0 = N0.getOperand(0);
13163 if (N1.getOpcode() == ISD::MUL)
13164 N1 = N1.getOperand(0);
13166 // Return true if the two operands are loads to the same object and the
13167 // offset of the first is known to be less than the offset of the second.
13168 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13169 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13170 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13171 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13172 Load1->isIndexed())
13173 return 0;
13175 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13176 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13178 if (!BaseLocDecomp0.getBase() ||
13179 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13180 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13181 return 0;
13182 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13183 return -1;
13184 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13185 return 1;
13186 return 0;
13189 SDValue X;
13190 if (N0.getOpcode() == ISD::ADD) {
13191 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13192 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13193 N0.getOperand(1).getOperand(0));
13194 if (IsBefore < 0) {
13195 X = N0.getOperand(0);
13196 N0 = N0.getOperand(1);
13197 } else if (IsBefore > 0) {
13198 X = N0.getOperand(1);
13199 N0 = N0.getOperand(0);
13200 } else
13201 return SDValue();
13202 } else if (IsVecReduce(N0.getOperand(0))) {
13203 X = N0.getOperand(1);
13204 N0 = N0.getOperand(0);
13205 } else if (IsVecReduce(N0.getOperand(1))) {
13206 X = N0.getOperand(0);
13207 N0 = N0.getOperand(1);
13208 } else
13209 return SDValue();
13210 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13211 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13212 // Note this is backwards from how you would expect. We create
13213 // add(reduce(load + 16), reduce(load + 0)) so that the
13214 // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
13215 // the X as VADDV(load + 0).
13216 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13217 } else
13218 return SDValue();
13220 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13221 return SDValue();
13223 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13224 return SDValue();
13226 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13227 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13228 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13230 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13231 return R;
13232 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13233 return R;
13234 return SDValue();
13237 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13238 const ARMSubtarget *Subtarget) {
13239 if (!Subtarget->hasMVEIntegerOps())
13240 return SDValue();
13242 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13243 return R;
13245 EVT VT = N->getValueType(0);
13246 SDValue N0 = N->getOperand(0);
13247 SDValue N1 = N->getOperand(1);
13248 SDLoc dl(N);
13250 if (VT != MVT::i64)
13251 return SDValue();
13253 // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
13254 // will look like:
13255 // t1: i32,i32 = ARMISD::VADDLVs x
13256 // t2: i64 = build_pair t1, t1:1
13257 // t3: i64 = add t2, y
13258 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13259 // the add to be simplified separately.
13260 // We also need to check for sext / zext and commutative adds.
13261 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13262 SDValue NB) {
13263 if (NB->getOpcode() != ISD::BUILD_PAIR)
13264 return SDValue();
13265 SDValue VecRed = NB->getOperand(0);
13266 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13267 VecRed.getResNo() != 0 ||
13268 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13269 return SDValue();
13271 if (VecRed->getOpcode() == OpcodeA) {
13272 // add(NA, VADDLVA(Inp, Y)) -> VADDLVA(add(NA, Inp), Y)
13273 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13274 VecRed.getOperand(0), VecRed.getOperand(1));
13275 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13278 SmallVector<SDValue, 4> Ops;
13279 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
13280 DAG.getConstant(0, dl, MVT::i32)));
13281 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
13282 DAG.getConstant(1, dl, MVT::i32)));
13283 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13284 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13285 Ops.push_back(VecRed->getOperand(I));
13286 SDValue Red =
13287 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13288 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13289 SDValue(Red.getNode(), 1));
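// Try each flavour of long reduction (VADDLV / VMLALV, signed, unsigned and
// predicated), with the add operands in either order.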
13292 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13293 return M;
13294 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13295 return M;
13296 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13297 return M;
13298 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13299 return M;
13300 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13301 return M;
13302 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13303 return M;
13304 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13305 return M;
13306 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13307 return M;
13308 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13309 return M;
13310 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13311 return M;
13312 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13313 return M;
13314 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13315 return M;
13316 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13317 return M;
13318 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13319 return M;
13320 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13321 return M;
13322 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13323 return M;
13324 return SDValue();
13327 bool
13328 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13329 CombineLevel Level) const {
13330 if (Level == BeforeLegalizeTypes)
13331 return true;
13333 if (N->getOpcode() != ISD::SHL)
13334 return true;
13336 if (Subtarget->isThumb1Only()) {
13337 // Avoid making expensive immediates by commuting shifts. (This logic
13338 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13339 // for free.)
13340 if (N->getOpcode() != ISD::SHL)
13341 return true;
13342 SDValue N1 = N->getOperand(0);
13343 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13344 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13345 return true;
13346 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13347 if (Const->getAPIntValue().ult(256))
13348 return false;
13349 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13350 Const->getAPIntValue().sgt(-256))
13351 return false;
13353 return true;
13356 // Turn off commute-with-shift transform after legalization, so it doesn't
13357 // conflict with PerformSHLSimplify. (We could try to detect when
13358 // PerformSHLSimplify would trigger more precisely, but it isn't
13359 // really necessary.)
13360 return false;
13363 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13364 const SDNode *N, CombineLevel Level) const {
13365 if (!Subtarget->isThumb1Only())
13366 return true;
13368 if (Level == BeforeLegalizeTypes)
13369 return true;
13371 return false;
13374 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13375 if (!Subtarget->hasNEON()) {
13376 if (Subtarget->isThumb1Only())
13377 return VT.getScalarSizeInBits() <= 32;
13378 return true;
13380 return VT.isScalarInteger();
13383 static SDValue PerformSHLSimplify(SDNode *N,
13384 TargetLowering::DAGCombinerInfo &DCI,
13385 const ARMSubtarget *ST) {
13386 // Allow the generic combiner to identify potential bswaps.
13387 if (DCI.isBeforeLegalize())
13388 return SDValue();
13390 // DAG combiner will fold:
13391 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13392 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13393 // Other code patterns that can also be modified have the following form:
13394 // b + ((a << 1) | 510)
13395 // b + ((a << 1) & 510)
13396 // b + ((a << 1) ^ 510)
13397 // b + ((a << 1) + 510)
13399 // Many instructions can perform the shift for free, but it requires both
13400 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13401 // instruction will be needed. So, unfold back to the original pattern if:
13402 // - c1 and c2 are small enough that they don't require mov imms.
13403 // - the user(s) of the node can perform a shl
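// For example, ((a << 1) | 510) is unfolded back to ((a | 255) << 1), so the
// user of the node can fold the shift into its own shifted-register operand.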
13405 // No shifted operands for 16-bit instructions.
13406 if (ST->isThumb() && ST->isThumb1Only())
13407 return SDValue();
13409 // Check that all the users could perform the shl themselves.
13410 for (auto U : N->uses()) {
13411 switch(U->getOpcode()) {
13412 default:
13413 return SDValue();
13414 case ISD::SUB:
13415 case ISD::ADD:
13416 case ISD::AND:
13417 case ISD::OR:
13418 case ISD::XOR:
13419 case ISD::SETCC:
13420 case ARMISD::CMP:
13421 // Check that the user isn't already using a constant because there
13422 // aren't any instructions that support an immediate operand and a
13423 // shifted operand.
13424 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13425 isa<ConstantSDNode>(U->getOperand(1)))
13426 return SDValue();
13428 // Check that it's not already using a shift.
13429 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13430 U->getOperand(1).getOpcode() == ISD::SHL)
13431 return SDValue();
13432 break;
13436 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13437 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13438 return SDValue();
13440 if (N->getOperand(0).getOpcode() != ISD::SHL)
13441 return SDValue();
13443 SDValue SHL = N->getOperand(0);
13445 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13446 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13447 if (!C1ShlC2 || !C2)
13448 return SDValue();
13450 APInt C2Int = C2->getAPIntValue();
13451 APInt C1Int = C1ShlC2->getAPIntValue();
13453 // Check that performing a lshr will not lose any information.
13454 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
13455 C2Int.getBitWidth() - C2->getZExtValue());
13456 if ((C1Int & Mask) != C1Int)
13457 return SDValue();
13459 // Shift the first constant.
13460 C1Int.lshrInPlace(C2Int);
13462 // The immediates are encoded as an 8-bit value that can be rotated.
13463 auto LargeImm = [](const APInt &Imm) {
13464 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
13465 return Imm.getBitWidth() - Zeros > 8;
13468 if (LargeImm(C1Int) || LargeImm(C2Int))
13469 return SDValue();
13471 SelectionDAG &DAG = DCI.DAG;
13472 SDLoc dl(N);
13473 SDValue X = SHL.getOperand(0);
13474 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13475 DAG.getConstant(C1Int, dl, MVT::i32));
13476 // Shift left to compensate for the lshr of C1Int.
13477 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13479 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13480 SHL.dump(); N->dump());
13481 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13482 return Res;
13486 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13488 static SDValue PerformADDCombine(SDNode *N,
13489 TargetLowering::DAGCombinerInfo &DCI,
13490 const ARMSubtarget *Subtarget) {
13491 SDValue N0 = N->getOperand(0);
13492 SDValue N1 = N->getOperand(1);
13494 // Only works one way, because it needs an immediate operand.
13495 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13496 return Result;
13498 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13499 return Result;
13501 // First try with the default operand order.
13502 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13503 return Result;
13505 // If that didn't work, try again with the operands commuted.
13506 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13509 // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13510 // providing -X is as cheap as X (currently, just a constant).
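// e.g. (sub 0, (csinc 1, Y, CC)) -> (csinv -1, Y, CC)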
13511 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
13512 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
13513 return SDValue();
13514 SDValue CSINC = N->getOperand(1);
13515 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
13516 return SDValue();
13518 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
13519 if (!X)
13520 return SDValue();
13522 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
13523 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
13524 CSINC.getOperand(0)),
13525 CSINC.getOperand(1), CSINC.getOperand(2),
13526 CSINC.getOperand(3));
13529 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
13531 static SDValue PerformSUBCombine(SDNode *N,
13532 TargetLowering::DAGCombinerInfo &DCI,
13533 const ARMSubtarget *Subtarget) {
13534 SDValue N0 = N->getOperand(0);
13535 SDValue N1 = N->getOperand(1);
13537 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
13538 if (N1.getNode()->hasOneUse())
13539 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
13540 return Result;
13542 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
13543 return R;
13545 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
13546 return SDValue();
13548 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
13549 // so that we can readily pattern match more mve instructions which can use
13550 // a scalar operand.
13551 SDValue VDup = N->getOperand(1);
13552 if (VDup->getOpcode() != ARMISD::VDUP)
13553 return SDValue();
13555 SDValue VMov = N->getOperand(0);
13556 if (VMov->getOpcode() == ISD::BITCAST)
13557 VMov = VMov->getOperand(0);
13559 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
13560 return SDValue();
13562 SDLoc dl(N);
13563 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
13564 DCI.DAG.getConstant(0, dl, MVT::i32),
13565 VDup->getOperand(0));
13566 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
13569 /// PerformVMULCombine
13570 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
13571 /// special multiplier accumulator forwarding.
13572 /// vmul d3, d0, d2
13573 /// vmla d3, d1, d2
13574 /// is faster than
13575 /// vadd d3, d0, d1
13576 /// vmul d3, d3, d2
13577 // However, for (A + B) * (A + B),
13578 // vadd d2, d0, d1
13579 // vmul d3, d0, d2
13580 // vmla d3, d1, d2
13581 // is slower than
13582 // vadd d2, d0, d1
13583 // vmul d3, d2, d2
13584 static SDValue PerformVMULCombine(SDNode *N,
13585 TargetLowering::DAGCombinerInfo &DCI,
13586 const ARMSubtarget *Subtarget) {
13587 if (!Subtarget->hasVMLxForwarding())
13588 return SDValue();
13590 SelectionDAG &DAG = DCI.DAG;
13591 SDValue N0 = N->getOperand(0);
13592 SDValue N1 = N->getOperand(1);
13593 unsigned Opcode = N0.getOpcode();
13594 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13595 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
13596 Opcode = N1.getOpcode();
13597 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13598 Opcode != ISD::FADD && Opcode != ISD::FSUB)
13599 return SDValue();
13600 std::swap(N0, N1);
13603 if (N0 == N1)
13604 return SDValue();
13606 EVT VT = N->getValueType(0);
13607 SDLoc DL(N);
13608 SDValue N00 = N0->getOperand(0);
13609 SDValue N01 = N0->getOperand(1);
13610 return DAG.getNode(Opcode, DL, VT,
13611 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
13612 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
13615 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
13616 const ARMSubtarget *Subtarget) {
13617 EVT VT = N->getValueType(0);
13618 if (VT != MVT::v2i64)
13619 return SDValue();
13621 SDValue N0 = N->getOperand(0);
13622 SDValue N1 = N->getOperand(1);
13624 auto IsSignExt = [&](SDValue Op) {
13625 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
13626 return SDValue();
13627 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
13628 if (VT.getScalarSizeInBits() == 32)
13629 return Op->getOperand(0);
13630 return SDValue();
13632 auto IsZeroExt = [&](SDValue Op) {
13633 // Zero extends are a little more awkward. At the point we are matching
13634 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
13635 // That might be before or after a bitcast depending on how the and is
13636 // placed. Because this has to look through bitcasts, it is currently only
13637 // supported on LE.
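// e.g. (and (v4i32 X), (build_vector -1, 0, -1, 0)), viewed as v2i64, keeps
// only the bottom 32 bits of each 64-bit lane.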
13638 if (!Subtarget->isLittle())
13639 return SDValue();
13641 SDValue And = Op;
13642 if (And->getOpcode() == ISD::BITCAST)
13643 And = And->getOperand(0);
13644 if (And->getOpcode() != ISD::AND)
13645 return SDValue();
13646 SDValue Mask = And->getOperand(1);
13647 if (Mask->getOpcode() == ISD::BITCAST)
13648 Mask = Mask->getOperand(0);
13650 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
13651 Mask.getValueType() != MVT::v4i32)
13652 return SDValue();
13653 if (isAllOnesConstant(Mask->getOperand(0)) &&
13654 isNullConstant(Mask->getOperand(1)) &&
13655 isAllOnesConstant(Mask->getOperand(2)) &&
13656 isNullConstant(Mask->getOperand(3)))
13657 return And->getOperand(0);
13658 return SDValue();
13661 SDLoc dl(N);
13662 if (SDValue Op0 = IsSignExt(N0)) {
13663 if (SDValue Op1 = IsSignExt(N1)) {
13664 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
13665 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
13666 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
13669 if (SDValue Op0 = IsZeroExt(N0)) {
13670 if (SDValue Op1 = IsZeroExt(N1)) {
13671 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
13672 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
13673 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
13677 return SDValue();
13680 static SDValue PerformMULCombine(SDNode *N,
13681 TargetLowering::DAGCombinerInfo &DCI,
13682 const ARMSubtarget *Subtarget) {
13683 SelectionDAG &DAG = DCI.DAG;
13685 EVT VT = N->getValueType(0);
13686 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
13687 return PerformMVEVMULLCombine(N, DAG, Subtarget);
13689 if (Subtarget->isThumb1Only())
13690 return SDValue();
13692 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
13693 return SDValue();
13695 if (VT.is64BitVector() || VT.is128BitVector())
13696 return PerformVMULCombine(N, DCI, Subtarget);
13697 if (VT != MVT::i32)
13698 return SDValue();
13700 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
13701 if (!C)
13702 return SDValue();
13704 int64_t MulAmt = C->getSExtValue();
13705 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
13707 ShiftAmt = ShiftAmt & (32 - 1);
13708 SDValue V = N->getOperand(0);
13709 SDLoc DL(N);
13711 SDValue Res;
13712 MulAmt >>= ShiftAmt;
13714 if (MulAmt >= 0) {
13715 if (isPowerOf2_32(MulAmt - 1)) {
13716 // (mul x, 2^N + 1) => (add (shl x, N), x)
13717 Res = DAG.getNode(ISD::ADD, DL, VT,
13718 V,
13719 DAG.getNode(ISD::SHL, DL, VT,
13720 V,
13721 DAG.getConstant(Log2_32(MulAmt - 1), DL,
13722 MVT::i32)));
13723 } else if (isPowerOf2_32(MulAmt + 1)) {
13724 // (mul x, 2^N - 1) => (sub (shl x, N), x)
13725 Res = DAG.getNode(ISD::SUB, DL, VT,
13726 DAG.getNode(ISD::SHL, DL, VT,
13727 V,
13728 DAG.getConstant(Log2_32(MulAmt + 1), DL,
13729 MVT::i32)),
13730 V);
13731 } else
13732 return SDValue();
13733 } else {
13734 uint64_t MulAmtAbs = -MulAmt;
13735 if (isPowerOf2_32(MulAmtAbs + 1)) {
13736 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
13737 Res = DAG.getNode(ISD::SUB, DL, VT,
13738 V,
13739 DAG.getNode(ISD::SHL, DL, VT,
13740 V,
13741 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
13742 MVT::i32)));
13743 } else if (isPowerOf2_32(MulAmtAbs - 1)) {
13744 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
13745 Res = DAG.getNode(ISD::ADD, DL, VT,
13746 V,
13747 DAG.getNode(ISD::SHL, DL, VT,
13748 V,
13749 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
13750 MVT::i32)));
13751 Res = DAG.getNode(ISD::SUB, DL, VT,
13752 DAG.getConstant(0, DL, MVT::i32), Res);
13753 } else
13754 return SDValue();
13757 if (ShiftAmt != 0)
13758 Res = DAG.getNode(ISD::SHL, DL, VT,
13759 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
13761 // Do not add new nodes to DAG combiner worklist.
13762 DCI.CombineTo(N, Res, false);
13763 return SDValue();
13766 static SDValue CombineANDShift(SDNode *N,
13767 TargetLowering::DAGCombinerInfo &DCI,
13768 const ARMSubtarget *Subtarget) {
13769 // Allow DAGCombine to pattern-match before we touch the canonical form.
13770 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
13771 return SDValue();
13773 if (N->getValueType(0) != MVT::i32)
13774 return SDValue();
13776 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
13777 if (!N1C)
13778 return SDValue();
13780 uint32_t C1 = (uint32_t)N1C->getZExtValue();
13781 // Don't transform uxtb/uxth.
13782 if (C1 == 255 || C1 == 65535)
13783 return SDValue();
13785 SDNode *N0 = N->getOperand(0).getNode();
13786 if (!N0->hasOneUse())
13787 return SDValue();
13789 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
13790 return SDValue();
13792 bool LeftShift = N0->getOpcode() == ISD::SHL;
13794 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
13795 if (!N01C)
13796 return SDValue();
13798 uint32_t C2 = (uint32_t)N01C->getZExtValue();
13799 if (!C2 || C2 >= 32)
13800 return SDValue();
13802 // Clear irrelevant bits in the mask.
13803 if (LeftShift)
13804 C1 &= (-1U << C2);
13805 else
13806 C1 &= (-1U >> C2);
13808 SelectionDAG &DAG = DCI.DAG;
13809 SDLoc DL(N);
13811 // We have a pattern of the form "(and (shl x, c2) c1)" or
13812 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
13813 // transform to a pair of shifts, to save materializing c1.
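// e.g. (and (srl x, 1), 0x00ffffff) -> (srl (shl x, 7), 8), saving the
// materialization of the wide mask constant.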
13815 // First pattern: right shift, then mask off leading bits.
13816 // FIXME: Use demanded bits?
13817 if (!LeftShift && isMask_32(C1)) {
13818 uint32_t C3 = countLeadingZeros(C1);
13819 if (C2 < C3) {
13820 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
13821 DAG.getConstant(C3 - C2, DL, MVT::i32));
13822 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
13823 DAG.getConstant(C3, DL, MVT::i32));
13827 // First pattern, reversed: left shift, then mask off trailing bits.
13828 if (LeftShift && isMask_32(~C1)) {
13829 uint32_t C3 = countTrailingZeros(C1);
13830 if (C2 < C3) {
13831 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
13832 DAG.getConstant(C3 - C2, DL, MVT::i32));
13833 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
13834 DAG.getConstant(C3, DL, MVT::i32));
13838 // Second pattern: left shift, then mask off leading bits.
13839 // FIXME: Use demanded bits?
13840 if (LeftShift && isShiftedMask_32(C1)) {
13841 uint32_t Trailing = countTrailingZeros(C1);
13842 uint32_t C3 = countLeadingZeros(C1);
13843 if (Trailing == C2 && C2 + C3 < 32) {
13844 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
13845 DAG.getConstant(C2 + C3, DL, MVT::i32));
13846 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
13847 DAG.getConstant(C3, DL, MVT::i32));
13851 // Second pattern, reversed: right shift, then mask off trailing bits.
13852 // FIXME: Handle other patterns of known/demanded bits.
13853 if (!LeftShift && isShiftedMask_32(C1)) {
13854 uint32_t Leading = countLeadingZeros(C1);
13855 uint32_t C3 = countTrailingZeros(C1);
13856 if (Leading == C2 && C2 + C3 < 32) {
13857 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
13858 DAG.getConstant(C2 + C3, DL, MVT::i32));
13859 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
13860 DAG.getConstant(C3, DL, MVT::i32));
13864 // FIXME: Transform "(and (shl x, c2) c1)" ->
13865 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
13866 // c1.
13867 return SDValue();
13870 static SDValue PerformANDCombine(SDNode *N,
13871 TargetLowering::DAGCombinerInfo &DCI,
13872 const ARMSubtarget *Subtarget) {
13873 // Attempt to use immediate-form VBIC
13874 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
13875 SDLoc dl(N);
13876 EVT VT = N->getValueType(0);
13877 SelectionDAG &DAG = DCI.DAG;
13879 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 ||
13880 VT == MVT::v8i1 || VT == MVT::v16i1)
13881 return SDValue();
13883 APInt SplatBits, SplatUndef;
13884 unsigned SplatBitSize;
13885 bool HasAnyUndefs;
13886 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
13887 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
13888 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
13889 SplatBitSize == 64) {
13890 EVT VbicVT;
13891 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
13892 SplatUndef.getZExtValue(), SplatBitSize,
13893 DAG, dl, VbicVT, VT, OtherModImm);
13894 if (Val.getNode()) {
13895 SDValue Input =
13896 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
13897 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
13898 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
13903 if (!Subtarget->isThumb1Only()) {
13904 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
13905 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
13906 return Result;
13908 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13909 return Result;
13912 if (Subtarget->isThumb1Only())
13913 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
13914 return Result;
13916 return SDValue();
13919 // Try combining OR nodes to SMULWB, SMULWT.
13920 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
13921 TargetLowering::DAGCombinerInfo &DCI,
13922 const ARMSubtarget *Subtarget) {
13923 if (!Subtarget->hasV6Ops() ||
13924 (Subtarget->isThumb() &&
13925 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
13926 return SDValue();
13928 SDValue SRL = OR->getOperand(0);
13929 SDValue SHL = OR->getOperand(1);
13931 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
13932 SRL = OR->getOperand(1);
13933 SHL = OR->getOperand(0);
13935 if (!isSRL16(SRL) || !isSHL16(SHL))
13936 return SDValue();
13938 // The first operands to the shifts need to be the two results from the
13939 // same smul_lohi node.
13940 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
13941 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
13942 return SDValue();
13944 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
13945 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
13946 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
13947 return SDValue();
13949 // Now we have:
13950 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
13951 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
13952 // For SMULWB the 16-bit value will be sign extended somehow.
13953 // For SMULWT only the SRA is required.
13954 // Check both sides of SMUL_LOHI
13955 SDValue OpS16 = SMULLOHI->getOperand(0);
13956 SDValue OpS32 = SMULLOHI->getOperand(1);
13958 SelectionDAG &DAG = DCI.DAG;
13959 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
13960 OpS16 = OpS32;
13961 OpS32 = SMULLOHI->getOperand(0);
13964 SDLoc dl(OR);
13965 unsigned Opcode = 0;
13966 if (isS16(OpS16, DAG))
13967 Opcode = ARMISD::SMULWB;
13968 else if (isSRA16(OpS16)) {
13969 Opcode = ARMISD::SMULWT;
13970 OpS16 = OpS16->getOperand(0);
13972 else
13973 return SDValue();
13975 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
13976 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
13977 return SDValue(OR, 0);
13980 static SDValue PerformORCombineToBFI(SDNode *N,
13981 TargetLowering::DAGCombinerInfo &DCI,
13982 const ARMSubtarget *Subtarget) {
13983 // BFI is only available on V6T2+
13984 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
13985 return SDValue();
13987 EVT VT = N->getValueType(0);
13988 SDValue N0 = N->getOperand(0);
13989 SDValue N1 = N->getOperand(1);
13990 SelectionDAG &DAG = DCI.DAG;
13991 SDLoc DL(N);
13992 // 1) or (and A, mask), val => ARMbfi A, val, mask
13993 // iff (val & mask) == val
13995 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
13996 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
13997 // && mask == ~mask2
13998 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
13999 // && ~mask == mask2
14000 // (i.e., copy a bitfield value into another bitfield of the same width)
14002 if (VT != MVT::i32)
14003 return SDValue();
14005 SDValue N00 = N0.getOperand(0);
14007 // The value and the mask need to be constants so we can verify this is
14008 // actually a bitfield set. If the mask is 0xffff, we can do better
14009 // via a movt instruction, so don't use BFI in that case.
14010 SDValue MaskOp = N0.getOperand(1);
14011 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14012 if (!MaskC)
14013 return SDValue();
14014 unsigned Mask = MaskC->getZExtValue();
14015 if (Mask == 0xffff)
14016 return SDValue();
14017 SDValue Res;
14018 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14019 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14020 if (N1C) {
14021 unsigned Val = N1C->getZExtValue();
14022 if ((Val & ~Mask) != Val)
14023 return SDValue();
14025 if (ARM::isBitFieldInvertedMask(Mask)) {
14026 Val >>= countTrailingZeros(~Mask);
14028 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14029 DAG.getConstant(Val, DL, MVT::i32),
14030 DAG.getConstant(Mask, DL, MVT::i32));
14032 DCI.CombineTo(N, Res, false);
14033 // Return value from the original node to inform the combiner that N is
14034 // now dead.
14035 return SDValue(N, 0);
14037 } else if (N1.getOpcode() == ISD::AND) {
14038 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14039 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14040 if (!N11C)
14041 return SDValue();
14042 unsigned Mask2 = N11C->getZExtValue();
14044 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14045 // as is to match.
14046 if (ARM::isBitFieldInvertedMask(Mask) &&
14047 (Mask == ~Mask2)) {
14048 // The pack halfword instruction works better for masks that fit it,
14049 // so use that when it's available.
14050 if (Subtarget->hasDSP() &&
14051 (Mask == 0xffff || Mask == 0xffff0000))
14052 return SDValue();
14053 // 2a
14054 unsigned amt = countTrailingZeros(Mask2);
14055 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14056 DAG.getConstant(amt, DL, MVT::i32));
14057 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14058 DAG.getConstant(Mask, DL, MVT::i32));
14059 DCI.CombineTo(N, Res, false);
14060 // Return value from the original node to inform the combiner that N is
14061 // now dead.
14062 return SDValue(N, 0);
14063 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14064 (~Mask == Mask2)) {
14065 // The pack halfword instruction works better for masks that fit it,
14066 // so use that when it's available.
14067 if (Subtarget->hasDSP() &&
14068 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14069 return SDValue();
14070 // 2b
14071 unsigned lsb = countTrailingZeros(Mask);
14072 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14073 DAG.getConstant(lsb, DL, MVT::i32));
14074 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14075 DAG.getConstant(Mask2, DL, MVT::i32));
14076 DCI.CombineTo(N, Res, false);
14077 // Return value from the original node to inform the combiner that N is
14078 // now dead.
14079 return SDValue(N, 0);
14083 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14084 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14085 ARM::isBitFieldInvertedMask(~Mask)) {
14086 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14087 // where lsb(mask) == #shamt and masked bits of B are known zero.
14088 SDValue ShAmt = N00.getOperand(1);
14089 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
14090 unsigned LSB = countTrailingZeros(Mask);
14091 if (ShAmtC != LSB)
14092 return SDValue();
14094 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14095 DAG.getConstant(~Mask, DL, MVT::i32));
14097 DCI.CombineTo(N, Res, false);
14098 // Return value from the original node to inform the combiner that N is
14099 // now dead.
14100 return SDValue(N, 0);
14103 return SDValue();
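// Check whether the given condition code is legal for an MVE VCMP/VCMPZ; the
// unsigned conditions HS and HI are only valid for integer comparisons.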
14106 static bool isValidMVECond(unsigned CC, bool IsFloat) {
14107 switch (CC) {
14108 case ARMCC::EQ:
14109 case ARMCC::NE:
14110 case ARMCC::LE:
14111 case ARMCC::GT:
14112 case ARMCC::GE:
14113 case ARMCC::LT:
14114 return true;
14115 case ARMCC::HS:
14116 case ARMCC::HI:
14117 return !IsFloat;
14118 default:
14119 return false;
14123 static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14124 if (N->getOpcode() == ARMISD::VCMP)
14125 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14126 else if (N->getOpcode() == ARMISD::VCMPZ)
14127 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14128 else
14129 llvm_unreachable("Not a VCMP/VCMPZ!");
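// Return true if the VCMP/VCMPZ in N could use the opposite condition code
// and still be a legal MVE compare.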
14132 static bool CanInvertMVEVCMP(SDValue N) {
14133 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14134 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14137 static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14138 const ARMSubtarget *Subtarget) {
14139 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14140 // together with predicates
14141 EVT VT = N->getValueType(0);
14142 SDLoc DL(N);
14143 SDValue N0 = N->getOperand(0);
14144 SDValue N1 = N->getOperand(1);
14146 auto IsFreelyInvertable = [&](SDValue V) {
14147 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14148 return CanInvertMVEVCMP(V);
14149 return false;
14152 // At least one operand must be freely invertible.
14153 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14154 return SDValue();
14156 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14157 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14158 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14159 return DAG.getLogicalNOT(DL, And, VT);
14162 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14163 static SDValue PerformORCombine(SDNode *N,
14164 TargetLowering::DAGCombinerInfo &DCI,
14165 const ARMSubtarget *Subtarget) {
14166 // Attempt to use immediate-form VORR
14167 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14168 SDLoc dl(N);
14169 EVT VT = N->getValueType(0);
14170 SelectionDAG &DAG = DCI.DAG;
14172 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14173 return SDValue();
14175 if (Subtarget->hasMVEIntegerOps() &&
14176 (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
14177 return PerformORCombine_i1(N, DAG, Subtarget);
14179 APInt SplatBits, SplatUndef;
14180 unsigned SplatBitSize;
14181 bool HasAnyUndefs;
14182 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14183 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14184 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14185 SplatBitSize == 64) {
14186 EVT VorrVT;
14187 SDValue Val =
14188 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14189 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14190 if (Val.getNode()) {
14191 SDValue Input =
14192 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
14193 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14194 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
14199 if (!Subtarget->isThumb1Only()) {
14200 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14201 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14202 return Result;
14203 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14204 return Result;
14207 SDValue N0 = N->getOperand(0);
14208 SDValue N1 = N->getOperand(1);
14210 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14211 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14212 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
14214 // The code below optimizes (or (and X, Y), Z).
14215 // The AND operand needs to have a single user to make these optimizations
14216 // profitable.
14217 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14218 return SDValue();
14220 APInt SplatUndef;
14221 unsigned SplatBitSize;
14222 bool HasAnyUndefs;
14224 APInt SplatBits0, SplatBits1;
14225 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14226 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14227 // Ensure that the second operand of both ands are constants
14228 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14229 HasAnyUndefs) && !HasAnyUndefs) {
14230 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14231 HasAnyUndefs) && !HasAnyUndefs) {
14232 // Ensure that the bit width of the constants are the same and that
14233 // the splat arguments are logical inverses as per the pattern we
14234 // are trying to simplify.
14235 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14236 SplatBits0 == ~SplatBits1) {
14237 // Canonicalize the vector type to make instruction selection
14238 // simpler.
14239 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14240 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14241 N0->getOperand(1),
14242 N0->getOperand(0),
14243 N1->getOperand(0));
14244 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
14250 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14251 // reasonable.
14252 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14253 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14254 return Res;
14257 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14258 return Result;
14260 return SDValue();
14263 static SDValue PerformXORCombine(SDNode *N,
14264 TargetLowering::DAGCombinerInfo &DCI,
14265 const ARMSubtarget *Subtarget) {
14266 EVT VT = N->getValueType(0);
14267 SelectionDAG &DAG = DCI.DAG;
14269 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14270 return SDValue();
14272 if (!Subtarget->isThumb1Only()) {
14273 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14274 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14275 return Result;
14277 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14278 return Result;
14281 if (Subtarget->hasMVEIntegerOps()) {
14282 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14283 SDValue N0 = N->getOperand(0);
14284 SDValue N1 = N->getOperand(1);
14285 const TargetLowering *TLI = Subtarget->getTargetLowering();
14286 if (TLI->isConstTrueVal(N1.getNode()) &&
14287 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14288 if (CanInvertMVEVCMP(N0)) {
14289 SDLoc DL(N0);
14290 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14292 SmallVector<SDValue, 4> Ops;
14293 Ops.push_back(N0->getOperand(0));
14294 if (N0->getOpcode() == ARMISD::VCMP)
14295 Ops.push_back(N0->getOperand(1));
14296 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14297 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14302 return SDValue();
14305 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14306 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14307 // their position in "to" (Rd).
14308 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14309 assert(N->getOpcode() == ARMISD::BFI);
14311 SDValue From = N->getOperand(1);
14312 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
14313 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
14315 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14316 // #C in the base of the SHR.
14317 if (From->getOpcode() == ISD::SRL &&
14318 isa<ConstantSDNode>(From->getOperand(1))) {
14319 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
14320 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14321 FromMask <<= Shift.getLimitedValue(31);
14322 From = From->getOperand(0);
14325 return From;
14328 // If A and B contain one contiguous set of bits, does A | B == A . B?
14330 // Neither A nor B may be zero.
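// e.g. A = 0b1100 and B = 0b0011 concatenate properly, but A = 0b1100 and
// B = 0b0001 do not.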
14331 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14332 unsigned LastActiveBitInA = A.countTrailingZeros();
14333 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
14334 return LastActiveBitInA - 1 == FirstActiveBitInB;
14337 static SDValue FindBFIToCombineWith(SDNode *N) {
14338 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14339 APInt ToMask, FromMask;
14340 SDValue From = ParseBFI(N, ToMask, FromMask);
14341 SDValue To = N->getOperand(0);
14343 SDValue V = To;
14344 if (V.getOpcode() != ARMISD::BFI)
14345 return SDValue();
14347 APInt NewToMask, NewFromMask;
14348 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14349 if (NewFrom != From)
14350 return SDValue();
14352 // Do the written bits conflict with any we've seen so far?
14353 if ((NewToMask & ToMask).getBoolValue())
14354 // Conflicting bits.
14355 return SDValue();
14357 // Are the new bits contiguous when combined with the old bits?
14358 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14359 BitsProperlyConcatenate(FromMask, NewFromMask))
14360 return V;
14361 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14362 BitsProperlyConcatenate(NewFromMask, FromMask))
14363 return V;
14365 return SDValue();
14368 static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14369 SDValue N0 = N->getOperand(0);
14370 SDValue N1 = N->getOperand(1);
14372 if (N1.getOpcode() == ISD::AND) {
14373 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14374 // the bits being cleared by the AND are not demanded by the BFI.
14375 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14376 if (!N11C)
14377 return SDValue();
14378 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
14379 unsigned LSB = countTrailingZeros(~InvMask);
14380 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
14381 assert(Width <
14382 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14383 "undefined behavior");
14384 unsigned Mask = (1u << Width) - 1;
14385 unsigned Mask2 = N11C->getZExtValue();
14386 if ((Mask & (~Mask2)) == 0)
14387 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14388 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14389 return SDValue();
14392 // Look for another BFI to combine with.
14393 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14394 // We've found a BFI.
14395 APInt ToMask1, FromMask1;
14396 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14398 APInt ToMask2, FromMask2;
14399 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14400 assert(From1 == From2);
14401 (void)From2;
14403 // Create a new BFI, combining the two together.
14404 APInt NewFromMask = FromMask1 | FromMask2;
14405 APInt NewToMask = ToMask1 | ToMask2;
14407 EVT VT = N->getValueType(0);
14408 SDLoc dl(N);
14410 if (NewFromMask[0] == 0)
14411 From1 = DAG.getNode(
14412 ISD::SRL, dl, VT, From1,
14413 DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
14414 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14415 DAG.getConstant(~NewToMask, dl, VT));
14418 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14419 // that lower bit insertions are performed first, provided that M1 and M2
14420 // do not overlap. This can allow multiple BFI instructions to be combined
14421 // together by the other folds above.
14422 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14423 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14424 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14426 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14427 ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
14428 return SDValue();
14430 EVT VT = N->getValueType(0);
14431 SDLoc dl(N);
14432 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14433 N->getOperand(1), N->getOperand(2));
14434 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14435 N0.getOperand(2));
14438 return SDValue();
14441 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
14442 /// ARMISD::VMOVRRD.
14443 static SDValue PerformVMOVRRDCombine(SDNode *N,
14444 TargetLowering::DAGCombinerInfo &DCI,
14445 const ARMSubtarget *Subtarget) {
14446 // vmovrrd(vmovdrr x, y) -> x,y
14447 SDValue InDouble = N->getOperand(0);
14448 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
14449 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
14451 // vmovrrd(load f64) -> (load i32), (load i32)
14452 SDNode *InNode = InDouble.getNode();
14453 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
14454 InNode->getValueType(0) == MVT::f64 &&
14455 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
14456 !cast<LoadSDNode>(InNode)->isVolatile()) {
14457 // TODO: Should this be done for non-FrameIndex operands?
14458 LoadSDNode *LD = cast<LoadSDNode>(InNode);
14460 SelectionDAG &DAG = DCI.DAG;
14461 SDLoc DL(LD);
14462 SDValue BasePtr = LD->getBasePtr();
14463 SDValue NewLD1 =
14464 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
14465 LD->getAlignment(), LD->getMemOperand()->getFlags());
14467 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
14468 DAG.getConstant(4, DL, MVT::i32));
14470 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
14471 LD->getPointerInfo().getWithOffset(4),
14472 std::min(4U, LD->getAlignment()),
14473 LD->getMemOperand()->getFlags());
14475 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
14476 if (DCI.DAG.getDataLayout().isBigEndian())
14477 std::swap (NewLD1, NewLD2);
14478 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
14479 return Result;
14482 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
14483 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
14484 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14485 isa<ConstantSDNode>(InDouble.getOperand(1))) {
14486 SDValue BV = InDouble.getOperand(0);
14487 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
14488 // change lane order under big endian.
14489 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
14490 while (
14491 (BV.getOpcode() == ISD::BITCAST ||
14492 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
14493 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
14494 BVSwap = BV.getOpcode() == ISD::BITCAST;
14495 BV = BV.getOperand(0);
14497 if (BV.getValueType() != MVT::v4i32)
14498 return SDValue();
14500 // Handle buildvectors, pulling out the correct lane depending on
14501 // endianness.
14502 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
14503 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14504 SDValue Op0 = BV.getOperand(Offset);
14505 SDValue Op1 = BV.getOperand(Offset + 1);
14506 if (!Subtarget->isLittle() && BVSwap)
14507 std::swap(Op0, Op1);
14509 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14512 // A chain of insert_vectors, grabbing the correct value of the chain of
14513 // inserts.
14514 SDValue Op0, Op1;
14515 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
14516 if (isa<ConstantSDNode>(BV.getOperand(2))) {
14517 if (BV.getConstantOperandVal(2) == Offset)
14518 Op0 = BV.getOperand(1);
14519 if (BV.getConstantOperandVal(2) == Offset + 1)
14520 Op1 = BV.getOperand(1);
14522 BV = BV.getOperand(0);
14524 if (!Subtarget->isLittle() && BVSwap)
14525 std::swap(Op0, Op1);
14526 if (Op0 && Op1)
14527 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14530 return SDValue();
14533 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
14534 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
14535 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
14536 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
14537 SDValue Op0 = N->getOperand(0);
14538 SDValue Op1 = N->getOperand(1);
14539 if (Op0.getOpcode() == ISD::BITCAST)
14540 Op0 = Op0.getOperand(0);
14541 if (Op1.getOpcode() == ISD::BITCAST)
14542 Op1 = Op1.getOperand(0);
14543 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
14544 Op0.getNode() == Op1.getNode() &&
14545 Op0.getResNo() == 0 && Op1.getResNo() == 1)
14546 return DAG.getNode(ISD::BITCAST, SDLoc(N),
14547 N->getValueType(0), Op0.getOperand(0));
14548 return SDValue();
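/// PerformVMOVhrCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVhr.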
14551 static SDValue PerformVMOVhrCombine(SDNode *N,
14552 TargetLowering::DAGCombinerInfo &DCI) {
14553 SDValue Op0 = N->getOperand(0);
14555 // VMOVhr (VMOVrh (X)) -> X
14556 if (Op0->getOpcode() == ARMISD::VMOVrh)
14557 return Op0->getOperand(0);
14559 // FullFP16: half values are passed in S-registers, and we don't
14560 // need any of the bitcast and moves:
14562 // t2: f32,ch = CopyFromReg t0, Register:f32 %0
14563 // t5: i32 = bitcast t2
14564 // t18: f16 = ARMISD::VMOVhr t5
14565 if (Op0->getOpcode() == ISD::BITCAST) {
14566 SDValue Copy = Op0->getOperand(0);
14567 if (Copy.getValueType() == MVT::f32 &&
14568 Copy->getOpcode() == ISD::CopyFromReg) {
14569 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
14570 SDValue NewCopy =
14571 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
14572 return NewCopy;
14576 // fold (VMOVhr (load x)) -> (load (f16*)x)
14577 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
14578 if (LN0->hasOneUse() && LN0->isUnindexed() &&
14579 LN0->getMemoryVT() == MVT::i16) {
14580 SDValue Load =
14581 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
14582 LN0->getBasePtr(), LN0->getMemOperand());
14583 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
14584 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
14585 return Load;
14589 // Only the bottom 16 bits of the source register are used.
14590 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
14591 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
14592 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
14593 return SDValue(N, 0);
14595 return SDValue();
14598 static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
14599 SDValue N0 = N->getOperand(0);
14600 EVT VT = N->getValueType(0);
14602 // fold (VMOVrh (fpconst x)) -> const x
14603 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
14604 APFloat V = C->getValueAPF();
14605 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
14608 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
14609 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
14610 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
14612 SDValue Load =
14613 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
14614 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
14615 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
14616 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
14617 return Load;
14620 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
14621 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14622 isa<ConstantSDNode>(N0->getOperand(1)))
14623 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
14624 N0->getOperand(1));
14626 return SDValue();
14629 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
14630 /// are normal, non-volatile loads. If so, it is profitable to bitcast an
14631 /// i64 vector to have f64 elements, since the value can then be loaded
14632 /// directly into a VFP register.
14633 static bool hasNormalLoadOperand(SDNode *N) {
14634 unsigned NumElts = N->getValueType(0).getVectorNumElements();
14635 for (unsigned i = 0; i < NumElts; ++i) {
14636 SDNode *Elt = N->getOperand(i).getNode();
14637 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
14638 return true;
14640 return false;
14643 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
14644 /// ISD::BUILD_VECTOR.
14645 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
14646 TargetLowering::DAGCombinerInfo &DCI,
14647 const ARMSubtarget *Subtarget) {
14648 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
14649 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
14650 // into a pair of GPRs, which is fine when the value is used as a scalar,
14651 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
14652 SelectionDAG &DAG = DCI.DAG;
14653 if (N->getNumOperands() == 2)
14654 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
14655 return RV;
14657 // Load i64 elements as f64 values so that type legalization does not split
14658 // them up into i32 values.
14659 EVT VT = N->getValueType(0);
14660 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
14661 return SDValue();
14662 SDLoc dl(N);
14663 SmallVector<SDValue, 8> Ops;
14664 unsigned NumElts = VT.getVectorNumElements();
14665 for (unsigned i = 0; i < NumElts; ++i) {
14666 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
14667 Ops.push_back(V);
14668 // Make the DAGCombiner fold the bitcast.
14669 DCI.AddToWorklist(V.getNode());
14671 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
14672 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
14673 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
14676 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
14677 static SDValue
14678 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14679 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
14680 // At that time, we may have inserted bitcasts from integer to float.
14681 // If these bitcasts have survived DAGCombine, change the lowering of this
14682 // BUILD_VECTOR into something more vector friendly, i.e., something that
14683 // does not force the use of floating point types.
14685 // Make sure we can change the type of the vector.
14686 // This is possible iff:
14687 // 1. The vector is only used in a bitcast to an integer type. I.e.,
14688 // 1.1. Vector is used only once.
14689 // 1.2. Use is a bit convert to an integer type.
14690 // 2. The size of its operands is 32 bits (64-bit operands are not legal).
14691 EVT VT = N->getValueType(0);
14692 EVT EltVT = VT.getVectorElementType();
14694 // Check 1.1. and 2.
14695 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
14696 return SDValue();
14698 // By construction, the input type must be float.
14699 assert(EltVT == MVT::f32 && "Unexpected type!");
14701 // Check 1.2.
14702 SDNode *Use = *N->use_begin();
14703 if (Use->getOpcode() != ISD::BITCAST ||
14704 Use->getValueType(0).isFloatingPoint())
14705 return SDValue();
14707 // Check profitability.
14708 // The model is: if more than half of the relevant operands are bitcast from
14709 // i32, turn the build_vector into a sequence of insert_vector_elt.
14710 // Relevant operands are everything that is not statically
14711 // (i.e., at compile time) bitcast.
14712 unsigned NumOfBitCastedElts = 0;
14713 unsigned NumElts = VT.getVectorNumElements();
14714 unsigned NumOfRelevantElts = NumElts;
14715 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
14716 SDValue Elt = N->getOperand(Idx);
14717 if (Elt->getOpcode() == ISD::BITCAST) {
14718 // Assume only bitcasts to i32 will go away.
14719 if (Elt->getOperand(0).getValueType() == MVT::i32)
14720 ++NumOfBitCastedElts;
14721 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
14722 // Constants are statically cast, thus do not count them as
14723 // relevant operands.
14724 --NumOfRelevantElts;
14727 // Check if more than half of the elements require a non-free bitcast.
14728 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
14729 return SDValue();
14731 SelectionDAG &DAG = DCI.DAG;
14732 // Create the new vector type.
14733 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
14734 // Check if the type is legal.
14735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14736 if (!TLI.isTypeLegal(VecVT))
14737 return SDValue();
14739 // Combine:
14740 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
14741 // => BITCAST INSERT_VECTOR_ELT
14742 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
14743 // (BITCAST EN), N.
14744 SDValue Vec = DAG.getUNDEF(VecVT);
14745 SDLoc dl(N);
14746 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
14747 SDValue V = N->getOperand(Idx);
14748 if (V.isUndef())
14749 continue;
14750 if (V.getOpcode() == ISD::BITCAST &&
14751 V->getOperand(0).getValueType() == MVT::i32)
14752 // Fold obvious case.
14753 V = V.getOperand(0);
14754 else {
14755 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
14756 // Make the DAGCombiner fold the bitcasts.
14757 DCI.AddToWorklist(V.getNode());
14759 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
14760 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
14762 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
14763 // Make the DAGCombiner fold the bitcasts.
14764 DCI.AddToWorklist(Vec.getNode());
14765 return Vec;
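/// PerformPREDICATE_CASTCombine - Target-specific dag combine xforms for
/// ARMISD::PREDICATE_CAST, removing redundant nested casts and simplifying
/// the i32 source down to its used low 16 bits.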
14768 static SDValue
14769 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14770 EVT VT = N->getValueType(0);
14771 SDValue Op = N->getOperand(0);
14772 SDLoc dl(N);
14774 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
14775 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
14776 // If the valuetypes are the same, we can remove the cast entirely.
14777 if (Op->getOperand(0).getValueType() == VT)
14778 return Op->getOperand(0);
14779 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
14782 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
14783 // more VPNOTs, which might get folded as else predicates.
14784 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
14785 SDValue X =
14786 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
14787 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
14788 DCI.DAG.getConstant(65535, dl, MVT::i32));
14789 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
14792 // Only the bottom 16 bits of the source register are used.
14793 if (Op.getValueType() == MVT::i32) {
14794 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
14795 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
14796 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
14797 return SDValue(N, 0);
14799 return SDValue();
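/// PerformVECTOR_REG_CASTCombine - Target-specific dag combine xforms for
/// ARMISD::VECTOR_REG_CAST, which under little endian is just a BITCAST and
/// which can otherwise be folded through undef and nested casts.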
14802 static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
14803 const ARMSubtarget *ST) {
14804 EVT VT = N->getValueType(0);
14805 SDValue Op = N->getOperand(0);
14806 SDLoc dl(N);
14808 // Under little endian, a VECTOR_REG_CAST is equivalent to a BITCAST.
14809 if (ST->isLittle())
14810 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
14812 // VECTOR_REG_CAST undef -> undef
14813 if (Op.isUndef())
14814 return DAG.getUNDEF(VT);
14816 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
14817 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
14818 // If the valuetypes are the same, we can remove the cast entirely.
14819 if (Op->getOperand(0).getValueType() == VT)
14820 return Op->getOperand(0);
14821 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
14824 return SDValue();
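/// PerformVCMPCombine - Target-specific dag combine xforms for ARMISD::VCMP
/// under MVE, canonicalising compares against zero (and against VDUPs) so
/// that the VCMPZ and swapped-operand forms can be used.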
14827 static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
14828 const ARMSubtarget *Subtarget) {
14829 if (!Subtarget->hasMVEIntegerOps())
14830 return SDValue();
14832 EVT VT = N->getValueType(0);
14833 SDValue Op0 = N->getOperand(0);
14834 SDValue Op1 = N->getOperand(1);
14835 ARMCC::CondCodes Cond =
14836 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
14837 SDLoc dl(N);
14839 // vcmp X, 0, cc -> vcmpz X, cc
14840 if (isZeroVector(Op1))
14841 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
14843 unsigned SwappedCond = getSwappedCondition(Cond);
14844 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
14845 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
14846 if (isZeroVector(Op0))
14847 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
14848 DAG.getConstant(SwappedCond, dl, MVT::i32));
14849 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
14850 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
14851 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
14852 DAG.getConstant(SwappedCond, dl, MVT::i32));
14855 return SDValue();
14858 /// PerformInsertEltCombine - Target-specific dag combine xforms for
14859 /// ISD::INSERT_VECTOR_ELT.
14860 static SDValue PerformInsertEltCombine(SDNode *N,
14861 TargetLowering::DAGCombinerInfo &DCI) {
14862 // Bitcast an i64 load inserted into a vector to f64.
14863 // Otherwise, the i64 value will be legalized to a pair of i32 values.
14864 EVT VT = N->getValueType(0);
14865 SDNode *Elt = N->getOperand(1).getNode();
14866 if (VT.getVectorElementType() != MVT::i64 ||
14867 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
14868 return SDValue();
14870 SelectionDAG &DAG = DCI.DAG;
14871 SDLoc dl(N);
14872 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
14873 VT.getVectorNumElements());
14874 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
14875 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
14876 // Make the DAGCombiner fold the bitcasts.
14877 DCI.AddToWorklist(Vec.getNode());
14878 DCI.AddToWorklist(V.getNode());
14879 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
14880 Vec, V, N->getOperand(2));
14881 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
14884 // Convert a pair of extracts from the same base vector to a VMOVRRD, either
14885 // directly or bitcast to an integer if the original is a float vector.
14886 // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
14887 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
14888 static SDValue
14889 PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14890 EVT VT = N->getValueType(0);
14891 SDLoc dl(N);
14893 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
14894 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
14895 return SDValue();
14897 SDValue Ext = SDValue(N, 0);
14898 if (Ext.getOpcode() == ISD::BITCAST &&
14899 Ext.getOperand(0).getValueType() == MVT::f32)
14900 Ext = Ext.getOperand(0);
14901 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14902 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
14903 Ext.getConstantOperandVal(1) % 2 != 0)
14904 return SDValue();
14905 if (Ext->use_size() == 1 &&
14906 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
14907 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
14908 return SDValue();
14910 SDValue Op0 = Ext.getOperand(0);
14911 EVT VecVT = Op0.getValueType();
14912 unsigned Lane = Ext.getConstantOperandVal(1);
14913 if (VecVT.getVectorNumElements() != 4)
14914 return SDValue();
14916 // Find another extract, of Lane + 1
14917 auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
14918 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14919 isa<ConstantSDNode>(V->getOperand(1)) &&
14920 V->getConstantOperandVal(1) == Lane + 1;
14922 if (OtherIt == Op0->uses().end())
14923 return SDValue();
14925 // For float extracts, we need to be converting to an i32 for both vector
14926 // lanes.
14927 SDValue OtherExt(*OtherIt, 0);
14928 if (OtherExt.getValueType() != MVT::i32) {
14929 if (OtherExt->use_size() != 1 ||
14930 OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
14931 OtherExt->use_begin()->getValueType(0) != MVT::i32)
14932 return SDValue();
14933 OtherExt = SDValue(*OtherExt->use_begin(), 0);
14936 // Convert the type to a f64 and extract with a VMOVRRD.
14937 SDValue F64 = DCI.DAG.getNode(
14938 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14939 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
14940 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
14941 SDValue VMOVRRD =
14942 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
14944 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
14945 return VMOVRRD;
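/// PerformExtractEltCombine - Target-specific dag combine xforms for
/// ISD::EXTRACT_VECTOR_ELT.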
14948 static SDValue PerformExtractEltCombine(SDNode *N,
14949 TargetLowering::DAGCombinerInfo &DCI,
14950 const ARMSubtarget *ST) {
14951 SDValue Op0 = N->getOperand(0);
14952 EVT VT = N->getValueType(0);
14953 SDLoc dl(N);
14955 // extract (vdup x) -> x
14956 if (Op0->getOpcode() == ARMISD::VDUP) {
14957 SDValue X = Op0->getOperand(0);
14958 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
14959 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
14960 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
14961 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
14962 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
14963 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
14965 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
14966 X = X->getOperand(0);
14967 if (X.getValueType() == VT)
14968 return X;
14971 // extract ARM_BUILD_VECTOR -> x
14972 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
14973 isa<ConstantSDNode>(N->getOperand(1)) &&
14974 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
14975 return Op0.getOperand(N->getConstantOperandVal(1));
14978 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
14979 if (Op0.getValueType() == MVT::v4i32 &&
14980 isa<ConstantSDNode>(N->getOperand(1)) &&
14981 Op0.getOpcode() == ISD::BITCAST &&
14982 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
14983 Op0.getOperand(0).getValueType() == MVT::v2f64) {
14984 SDValue BV = Op0.getOperand(0);
14985 unsigned Offset = N->getConstantOperandVal(1);
14986 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
14987 if (MOV.getOpcode() == ARMISD::VMOVDRR)
14988 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
14991 // extract x, n; extract x, n+1 -> VMOVRRD x
14992 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
14993 return R;
14995 // extract (MVETrunc(x)) -> extract x
14996 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
14997 unsigned Idx = N->getConstantOperandVal(1);
14998 unsigned Vec =
14999 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15000 unsigned SubIdx =
15001 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15002 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15003 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15006 return SDValue();
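/// PerformSignExtendInregCombine - Target-specific dag combine xforms for
/// ISD::SIGN_EXTEND_INREG, turning a sign extension of a VGETLANEu into the
/// signed VGETLANEs form.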
15009 static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15010 SDValue Op = N->getOperand(0);
15011 EVT VT = N->getValueType(0);
15013 // sext_inreg(VGETLANEu) -> VGETLANEs
15014 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15015 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15016 Op.getOperand(0).getValueType().getScalarType())
15017 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15018 Op.getOperand(1));
15020 return SDValue();
15023 // When lowering complex nodes that we recognize, like VQDMULH and MULH, we
15024 // can end up with shuffle(binop(shuffle, shuffle)), which can be simplified to
15025 // the binop alone, as the shuffles cancel out.
15026 static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
15027 EVT VT = N->getValueType(0);
15028 if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT)
15029 return SDValue();
15030 SDValue Op = N->getOperand(0);
15032 // Looking for binary operators that will have been folded from
15033 // truncates/extends.
15034 switch (Op.getOpcode()) {
15035 case ARMISD::VQDMULH:
15036 case ISD::MULHS:
15037 case ISD::MULHU:
15038 case ISD::ABDS:
15039 case ISD::ABDU:
15040 break;
15041 default:
15042 return SDValue();
15045 ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
15046 ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1));
15047 if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() ||
15048 !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() ||
15049 Op0->getOperand(0).getValueType() != VT)
15050 return SDValue();
15052 // Check the mask turns into an identity shuffle.
15053 ArrayRef<int> NMask = N->getMask();
15054 ArrayRef<int> OpMask = Op0->getMask();
15055 for (int i = 0, e = NMask.size(); i != e; i++) {
15056 if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i)
15057 return SDValue();
15060 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
15061 Op0->getOperand(0), Op1->getOperand(0));
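/// PerformInsertSubvectorCombine - Target-specific dag combine xforms for
/// ISD::INSERT_SUBVECTOR, rewriting an "aligned" half-width insertion as a
/// concat_vectors of the subvector and the other half of the base vector.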
15064 static SDValue
15065 PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15066 SDValue Vec = N->getOperand(0);
15067 SDValue SubVec = N->getOperand(1);
15068 uint64_t IdxVal = N->getConstantOperandVal(2);
15069 EVT VecVT = Vec.getValueType();
15070 EVT SubVT = SubVec.getValueType();
15072 // Only do this for legal fixed vector types.
15073 if (!VecVT.isFixedLengthVector() ||
15074 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15075 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15076 return SDValue();
15078 // Ignore widening patterns.
15079 if (IdxVal == 0 && Vec.isUndef())
15080 return SDValue();
15082 // Subvector must be half the width and an "aligned" insertion.
15083 unsigned NumSubElts = SubVT.getVectorNumElements();
15084 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15085 (IdxVal != 0 && IdxVal != NumSubElts))
15086 return SDValue();
15088 // Fold insert_subvector -> concat_vectors
15089 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15090 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15091 SDLoc DL(N);
15092 SDValue Lo, Hi;
15093 if (IdxVal == 0) {
15094 Lo = SubVec;
15095 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15096 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15097 } else {
15098 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15099 DCI.DAG.getVectorIdxConstant(0, DL));
15100 Hi = SubVec;
15102 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15105 // shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15106 static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15107 SelectionDAG &DAG) {
15108 SDValue Trunc = N->getOperand(0);
15109 EVT VT = Trunc.getValueType();
15110 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15111 return SDValue();
15113 SDLoc DL(Trunc);
15114 if (isVMOVNTruncMask(N->getMask(), VT, 0))
15115 return DAG.getNode(
15116 ARMISD::VMOVN, DL, VT,
15117 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15118 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15119 DAG.getConstant(1, DL, MVT::i32));
15120 else if (isVMOVNTruncMask(N->getMask(), VT, 1))
15121 return DAG.getNode(
15122 ARMISD::VMOVN, DL, VT,
15123 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15124 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15125 DAG.getConstant(1, DL, MVT::i32));
15126 return SDValue();
15129 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15130 /// ISD::VECTOR_SHUFFLE.
15131 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15132 if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG))
15133 return R;
15134 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15135 return R;
15137 // The LLVM shufflevector instruction does not require the shuffle mask
15138 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15139 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15140 // operands do not match the mask length, they are extended by concatenating
15141 // them with undef vectors. That is probably the right thing for other
15142 // targets, but for NEON it is better to concatenate two double-register
15143 // size vector operands into a single quad-register size vector. Do that
15144 // transformation here:
15145 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15146 // shuffle(concat(v1, v2), undef)
15147 SDValue Op0 = N->getOperand(0);
15148 SDValue Op1 = N->getOperand(1);
15149 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15150 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15151 Op0.getNumOperands() != 2 ||
15152 Op1.getNumOperands() != 2)
15153 return SDValue();
15154 SDValue Concat0Op1 = Op0.getOperand(1);
15155 SDValue Concat1Op1 = Op1.getOperand(1);
15156 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15157 return SDValue();
15158 // Skip the transformation if any of the types are illegal.
15159 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15160 EVT VT = N->getValueType(0);
15161 if (!TLI.isTypeLegal(VT) ||
15162 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15163 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15164 return SDValue();
15166 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15167 Op0.getOperand(0), Op1.getOperand(0));
15168 // Translate the shuffle mask.
15169 SmallVector<int, 16> NewMask;
15170 unsigned NumElts = VT.getVectorNumElements();
15171 unsigned HalfElts = NumElts/2;
15172 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15173 for (unsigned n = 0; n < NumElts; ++n) {
15174 int MaskElt = SVN->getMaskElt(n);
15175 int NewElt = -1;
15176 if (MaskElt < (int)HalfElts)
15177 NewElt = MaskElt;
15178 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15179 NewElt = HalfElts + MaskElt - NumElts;
15180 NewMask.push_back(NewElt);
15182 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15183 DAG.getUNDEF(VT), NewMask);
15186 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
15187 /// NEON load/store intrinsics, and generic vector load/stores, to merge
15188 /// base address updates.
15189 /// For generic load/stores, the memory type is assumed to be a vector.
15190 /// The caller is assumed to have checked legality.
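/// For example (illustrative): a vld1 from a pointer whose only other use is
/// an ADD of the access size can become the post-incrementing VLD1_UPD form,
/// with the ADD replaced by the load's writeback result.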
15191 static SDValue CombineBaseUpdate(SDNode *N,
15192 TargetLowering::DAGCombinerInfo &DCI) {
15193 SelectionDAG &DAG = DCI.DAG;
15194 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
15195 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
15196 const bool isStore = N->getOpcode() == ISD::STORE;
15197 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
15198 SDValue Addr = N->getOperand(AddrOpIdx);
15199 MemSDNode *MemN = cast<MemSDNode>(N);
15200 SDLoc dl(N);
15202 // Search for a use of the address operand that is an increment.
15203 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
15204 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
15205 SDNode *User = *UI;
15206 if (User->getOpcode() != ISD::ADD ||
15207 UI.getUse().getResNo() != Addr.getResNo())
15208 continue;
15210 // Check that the add is independent of the load/store. Otherwise, folding
15211 // it would create a cycle. We can avoid searching through Addr as it's a
15212 // predecessor to both.
15213 SmallPtrSet<const SDNode *, 32> Visited;
15214 SmallVector<const SDNode *, 16> Worklist;
15215 Visited.insert(Addr.getNode());
15216 Worklist.push_back(N);
15217 Worklist.push_back(User);
15218 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
15219 SDNode::hasPredecessorHelper(User, Visited, Worklist))
15220 continue;
15222 // Find the new opcode for the updating load/store.
15223 bool isLoadOp = true;
15224 bool isLaneOp = false;
15225 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15226 // as an operand.
15227 bool hasAlignment = true;
15228 unsigned NewOpc = 0;
15229 unsigned NumVecs = 0;
15230 if (isIntrinsic) {
15231 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15232 switch (IntNo) {
15233 default: llvm_unreachable("unexpected intrinsic for Neon base update");
15234 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
15235 NumVecs = 1; break;
15236 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
15237 NumVecs = 2; break;
15238 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
15239 NumVecs = 3; break;
15240 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
15241 NumVecs = 4; break;
15242 case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD;
15243 NumVecs = 2; hasAlignment = false; break;
15244 case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD;
15245 NumVecs = 3; hasAlignment = false; break;
15246 case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
15247 NumVecs = 4; hasAlignment = false; break;
15248 case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
15249 NumVecs = 2; break;
15250 case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
15251 NumVecs = 3; break;
15252 case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
15253 NumVecs = 4; break;
15254 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
15255 NumVecs = 2; isLaneOp = true; break;
15256 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
15257 NumVecs = 3; isLaneOp = true; break;
15258 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
15259 NumVecs = 4; isLaneOp = true; break;
15260 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
15261 NumVecs = 1; isLoadOp = false; break;
15262 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
15263 NumVecs = 2; isLoadOp = false; break;
15264 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
15265 NumVecs = 3; isLoadOp = false; break;
15266 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
15267 NumVecs = 4; isLoadOp = false; break;
15268 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
15269 NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
15270 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
15271 NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
15272 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
15273 NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
15274 case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD;
15275 NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
15276 case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD;
15277 NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
15278 case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD;
15279 NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
15281 } else {
15282 isLaneOp = true;
15283 switch (N->getOpcode()) {
15284 default: llvm_unreachable("unexpected opcode for Neon base update");
15285 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
15286 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
15287 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
15288 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
15289 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
15290 NumVecs = 1; isLaneOp = false; break;
15291 case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
15292 NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
15296 // Find the size of memory referenced by the load/store.
15297 EVT VecTy;
15298 if (isLoadOp) {
15299 VecTy = N->getValueType(0);
15300 } else if (isIntrinsic) {
15301 VecTy = N->getOperand(AddrOpIdx+1).getValueType();
15302 } else {
15303 assert(isStore && "Node has to be a load, a store, or an intrinsic!");
15304 VecTy = N->getOperand(1).getValueType();
15307 bool isVLDDUPOp =
15308 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15309 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15311 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15312 if (isLaneOp || isVLDDUPOp)
15313 NumBytes /= VecTy.getVectorNumElements();
15315 // If the increment is a constant, it must match the memory ref size.
15316 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15317 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
15318 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
15319 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15320 // separate instructions that make it harder to use a non-constant update.
15321 continue;
15324 // OK, we found an ADD we can fold into the base update.
15325 // Now, create a _UPD node, taking care of not breaking alignment.
15327 EVT AlignedVecTy = VecTy;
15328 unsigned Alignment = MemN->getAlignment();
15330 // If this is a less-than-standard-aligned load/store, change the type to
15331 // match the standard alignment.
15332 // The alignment is overlooked when selecting _UPD variants, and it's
15333 // easier to introduce bitcasts here than fix that.
15334 // There are 3 ways to get to this base-update combine:
15335 // - intrinsics: they are assumed to be properly aligned (to the standard
15336 // alignment of the memory type), so we don't need to do anything.
15337 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
15338 // intrinsics, so, likewise, there's nothing to do.
15339 // - generic load/store instructions: the alignment is specified as an
15340 // explicit operand, rather than implicitly as the standard alignment
15341 // of the memory type (like the intrinsics). We need to change the
15342 // memory type to match the explicit alignment. That way, we don't
15343 // generate non-standard-aligned ARMISD::VLDx nodes.
15344 if (isa<LSBaseSDNode>(N)) {
15345 if (Alignment == 0)
15346 Alignment = 1;
15347 if (Alignment < VecTy.getScalarSizeInBits() / 8) {
15348 MVT EltTy = MVT::getIntegerVT(Alignment * 8);
15349 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
15350 assert(!isLaneOp && "Unexpected generic load/store lane.");
15351 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
15352 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
15354 // Don't set an explicit alignment on regular load/stores that we want
15355 // to transform to VLD/VST 1_UPD nodes.
15356 // This matches the behavior of regular load/stores, which only get an
15357 // explicit alignment if the MMO alignment is larger than the standard
15358 // alignment of the memory type.
15359 // Intrinsics, however, always get an explicit alignment, set to the
15360 // alignment of the MMO.
15361 Alignment = 1;
15364 // Create the new updating load/store node.
15365 // First, create an SDVTList for the new updating node's results.
15366 EVT Tys[6];
15367 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
15368 unsigned n;
15369 for (n = 0; n < NumResultVecs; ++n)
15370 Tys[n] = AlignedVecTy;
15371 Tys[n++] = MVT::i32;
15372 Tys[n] = MVT::Other;
15373 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
15375 // Then, gather the new node's operands.
15376 SmallVector<SDValue, 8> Ops;
15377 Ops.push_back(N->getOperand(0)); // incoming chain
15378 Ops.push_back(N->getOperand(AddrOpIdx));
15379 Ops.push_back(Inc);
15381 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
15382 // Try to match the intrinsic's signature
15383 Ops.push_back(StN->getValue());
15384 } else {
15385 // Loads (and of course intrinsics) match the intrinsics' signature,
15386 // so just add all but the alignment operand.
15387 unsigned LastOperand =
15388 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
15389 for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
15390 Ops.push_back(N->getOperand(i));
15393 // For all node types, the alignment operand is always the last one.
15394 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
15396 // If this is a non-standard-aligned STORE, the penultimate operand is the
15397 // stored value. Bitcast it to the aligned type.
15398 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
15399 SDValue &StVal = Ops[Ops.size()-2];
15400 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
15403 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
15404 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
15405 MemN->getMemOperand());
15407 // Update the uses.
15408 SmallVector<SDValue, 5> NewResults;
15409 for (unsigned i = 0; i < NumResultVecs; ++i)
15410 NewResults.push_back(SDValue(UpdN.getNode(), i));
15412 // If this is a non-standard-aligned LOAD, the first result is the loaded
15413 // value. Bitcast it to the expected result type.
15414 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
15415 SDValue &LdVal = NewResults[0];
15416 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
15419 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
15420 DCI.CombineTo(N, NewResults);
15421 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
15423 break;
15425 return SDValue();
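/// PerformVLDCombine - Defer to CombineBaseUpdate to merge base-address
/// updates, but only once the DAG has been legalized.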
15428 static SDValue PerformVLDCombine(SDNode *N,
15429 TargetLowering::DAGCombinerInfo &DCI) {
15430 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15431 return SDValue();
15433 return CombineBaseUpdate(N, DCI);
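/// PerformMVEVLDCombine - Merge base-address updates into the MVE
/// vld2q/vld4q and vst2q/vst4q intrinsics, producing the post-incrementing
/// VLDn_UPD / VSTn_UPD forms.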
15436 static SDValue PerformMVEVLDCombine(SDNode *N,
15437 TargetLowering::DAGCombinerInfo &DCI) {
15438 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15439 return SDValue();
15441 SelectionDAG &DAG = DCI.DAG;
15442 SDValue Addr = N->getOperand(2);
15443 MemSDNode *MemN = cast<MemSDNode>(N);
15444 SDLoc dl(N);
15446 // For the stores, where there are multiple intrinsics, we only actually want
15447 // to post-inc the last of them.
15448 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15449 if (IntNo == Intrinsic::arm_mve_vst2q &&
15450 cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
15451 return SDValue();
15452 if (IntNo == Intrinsic::arm_mve_vst4q &&
15453 cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
15454 return SDValue();
15456 // Search for a use of the address operand that is an increment.
15457 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
15458 UE = Addr.getNode()->use_end();
15459 UI != UE; ++UI) {
15460 SDNode *User = *UI;
15461 if (User->getOpcode() != ISD::ADD ||
15462 UI.getUse().getResNo() != Addr.getResNo())
15463 continue;
15465 // Check that the add is independent of the load/store. Otherwise, folding
15466 // it would create a cycle. We can avoid searching through Addr as it's a
15467 // predecessor to both.
15468 SmallPtrSet<const SDNode *, 32> Visited;
15469 SmallVector<const SDNode *, 16> Worklist;
15470 Visited.insert(Addr.getNode());
15471 Worklist.push_back(N);
15472 Worklist.push_back(User);
15473 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
15474 SDNode::hasPredecessorHelper(User, Visited, Worklist))
15475 continue;
15477 // Find the new opcode for the updating load/store.
15478 bool isLoadOp = true;
15479 unsigned NewOpc = 0;
15480 unsigned NumVecs = 0;
15481 switch (IntNo) {
15482 default:
15483 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
15484 case Intrinsic::arm_mve_vld2q:
15485 NewOpc = ARMISD::VLD2_UPD;
15486 NumVecs = 2;
15487 break;
15488 case Intrinsic::arm_mve_vld4q:
15489 NewOpc = ARMISD::VLD4_UPD;
15490 NumVecs = 4;
15491 break;
15492 case Intrinsic::arm_mve_vst2q:
15493 NewOpc = ARMISD::VST2_UPD;
15494 NumVecs = 2;
15495 isLoadOp = false;
15496 break;
15497 case Intrinsic::arm_mve_vst4q:
15498 NewOpc = ARMISD::VST4_UPD;
15499 NumVecs = 4;
15500 isLoadOp = false;
15501 break;
15504 // Find the size of memory referenced by the load/store.
15505 EVT VecTy;
15506 if (isLoadOp) {
15507 VecTy = N->getValueType(0);
15508 } else {
15509 VecTy = N->getOperand(3).getValueType();
15512 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15514 // If the increment is a constant, it must match the memory ref size.
15515 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15516 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
15517 if (!CInc || CInc->getZExtValue() != NumBytes)
15518 continue;
15520 // Create the new updating load/store node.
15521 // First, create an SDVTList for the new updating node's results.
15522 EVT Tys[6];
15523 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
15524 unsigned n;
15525 for (n = 0; n < NumResultVecs; ++n)
15526 Tys[n] = VecTy;
15527 Tys[n++] = MVT::i32;
15528 Tys[n] = MVT::Other;
15529 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
15531 // Then, gather the new node's operands.
15532 SmallVector<SDValue, 8> Ops;
15533 Ops.push_back(N->getOperand(0)); // incoming chain
15534 Ops.push_back(N->getOperand(2)); // ptr
15535 Ops.push_back(Inc);
15537 for (unsigned i = 3; i < N->getNumOperands(); ++i)
15538 Ops.push_back(N->getOperand(i));
15540 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
15541 MemN->getMemOperand());
15543 // Update the uses.
15544 SmallVector<SDValue, 5> NewResults;
15545 for (unsigned i = 0; i < NumResultVecs; ++i)
15546 NewResults.push_back(SDValue(UpdN.getNode(), i));
15548 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
15549 DCI.CombineTo(N, NewResults);
15550 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
15552 break;
15555 return SDValue();
15558 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
15559 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
15560 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
15561 /// return true.
15562 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15563 SelectionDAG &DAG = DCI.DAG;
15564 EVT VT = N->getValueType(0);
15565 // vldN-dup instructions only support 64-bit vectors for N > 1.
15566 if (!VT.is64BitVector())
15567 return false;
15569 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
15570 SDNode *VLD = N->getOperand(0).getNode();
15571 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
15572 return false;
15573 unsigned NumVecs = 0;
15574 unsigned NewOpc = 0;
15575 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
15576 if (IntNo == Intrinsic::arm_neon_vld2lane) {
15577 NumVecs = 2;
15578 NewOpc = ARMISD::VLD2DUP;
15579 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
15580 NumVecs = 3;
15581 NewOpc = ARMISD::VLD3DUP;
15582 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
15583 NumVecs = 4;
15584 NewOpc = ARMISD::VLD4DUP;
15585 } else {
15586 return false;
15589 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
15590 // numbers match the load.
15591 unsigned VLDLaneNo =
15592 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
15593 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
15594 UI != UE; ++UI) {
15595 // Ignore uses of the chain result.
15596 if (UI.getUse().getResNo() == NumVecs)
15597 continue;
15598 SDNode *User = *UI;
15599 if (User->getOpcode() != ARMISD::VDUPLANE ||
15600 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
15601 return false;
15604 // Create the vldN-dup node.
15605 EVT Tys[5];
15606 unsigned n;
15607 for (n = 0; n < NumVecs; ++n)
15608 Tys[n] = VT;
15609 Tys[n] = MVT::Other;
15610 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
15611 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
15612 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
15613 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
15614 Ops, VLDMemInt->getMemoryVT(),
15615 VLDMemInt->getMemOperand());
15617 // Update the uses.
15618 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
15619 UI != UE; ++UI) {
15620 unsigned ResNo = UI.getUse().getResNo();
15621 // Ignore uses of the chain result.
15622 if (ResNo == NumVecs)
15623 continue;
15624 SDNode *User = *UI;
15625 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
15628 // Now the vldN-lane intrinsic is dead except for its chain result.
15629 // Update uses of the chain.
15630 std::vector<SDValue> VLDDupResults;
15631 for (unsigned n = 0; n < NumVecs; ++n)
15632 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
15633 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
15634 DCI.CombineTo(VLD, VLDDupResults);
15636 return true;
15639 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
15640 /// ARMISD::VDUPLANE.
15641 static SDValue PerformVDUPLANECombine(SDNode *N,
15642 TargetLowering::DAGCombinerInfo &DCI,
15643 const ARMSubtarget *Subtarget) {
15644 SDValue Op = N->getOperand(0);
15645 EVT VT = N->getValueType(0);
15647 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
15648 if (Subtarget->hasMVEIntegerOps()) {
15649 EVT ExtractVT = VT.getVectorElementType();
15650 // We need to ensure we are creating a legal type.
15651 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
15652 ExtractVT = MVT::i32;
15653 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
15654 N->getOperand(0), N->getOperand(1));
15655 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
15658 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
15659 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
15660 if (CombineVLDDUP(N, DCI))
15661 return SDValue(N, 0);
15663 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
15664 // redundant. Ignore bit_converts for now; element sizes are checked below.
15665 while (Op.getOpcode() == ISD::BITCAST)
15666 Op = Op.getOperand(0);
15667 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
15668 return SDValue();
15670 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
15671 unsigned EltSize = Op.getScalarValueSizeInBits();
15672 // The canonical VMOV for a zero vector uses a 32-bit element size.
15673 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
15674 unsigned EltBits;
15675 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
15676 EltSize = 8;
15677 if (EltSize > VT.getScalarSizeInBits())
15678 return SDValue();
15680 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
15683 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
15684 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
15685 const ARMSubtarget *Subtarget) {
15686 SDValue Op = N->getOperand(0);
15687 SDLoc dl(N);
15689 if (Subtarget->hasMVEIntegerOps()) {
15690 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
15691 // need to come from a GPR.
15692 if (Op.getValueType() == MVT::f32)
15693 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
15694 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
15695 else if (Op.getValueType() == MVT::f16)
15696 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
15697 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
15700 if (!Subtarget->hasNEON())
15701 return SDValue();
15703 // Match VDUP(LOAD) -> VLD1DUP.
15704 // We match this pattern here rather than waiting for isel because the
15705 // transform is only legal for unindexed loads.
15706 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
15707 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
15708 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
15709 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
15710 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)};
15711 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
15712 SDValue VLDDup =
15713 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
15714 LD->getMemoryVT(), LD->getMemOperand());
15715 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
15716 return VLDDup;
15719 return SDValue();
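/// PerformLOADCombine - Target-specific dag combine xforms for ISD::LOAD.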
15722 static SDValue PerformLOADCombine(SDNode *N,
15723 TargetLowering::DAGCombinerInfo &DCI) {
15724 EVT VT = N->getValueType(0);
15726 // If this is a legal vector load, try to combine it into a VLD1_UPD.
15727 if (ISD::isNormalLoad(N) && VT.isVector() &&
15728 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
15729 return CombineBaseUpdate(N, DCI);
15731 return SDValue();
15734 // Optimize trunc store (of multiple scalars) to shuffle and store. First,
15735 // pack all of the elements in one place. Next, store to memory in fewer
15736 // chunks.
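// For example (illustrative): a <4 x i32> value truncating-stored as
// <4 x i16> is shuffled so the four i16 values sit contiguously at the
// bottom of the register and is then written out with a single i64 store.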
15737 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
15738 SelectionDAG &DAG) {
15739 SDValue StVal = St->getValue();
15740 EVT VT = StVal.getValueType();
15741 if (!St->isTruncatingStore() || !VT.isVector())
15742 return SDValue();
15743 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15744 EVT StVT = St->getMemoryVT();
15745 unsigned NumElems = VT.getVectorNumElements();
15746 assert(StVT != VT && "Cannot truncate to the same type");
15747 unsigned FromEltSz = VT.getScalarSizeInBits();
15748 unsigned ToEltSz = StVT.getScalarSizeInBits();
15750 // The From and To sizes, as well as ElemCount, must be powers of two.
15751 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
15752 return SDValue();
15754 // We are going to use the original vector elements for storing, so the
15755 // accumulated size of the smaller elements must be a multiple of the store size.
15756 if (0 != (NumElems * FromEltSz) % ToEltSz)
15757 return SDValue();
15759 unsigned SizeRatio = FromEltSz / ToEltSz;
15760 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
15762 // Create a type on which we perform the shuffle.
15763 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
15764 NumElems * SizeRatio);
15765 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
15767 SDLoc DL(St);
15768 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
15769 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
15770 for (unsigned i = 0; i < NumElems; ++i)
15771 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
15772 : i * SizeRatio;
15774 // Can't shuffle using an illegal type.
15775 if (!TLI.isTypeLegal(WideVecVT))
15776 return SDValue();
15778 SDValue Shuff = DAG.getVectorShuffle(
15779 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
15780 // At this point all of the data is stored at the bottom of the
15781 // register. We now need to save it to memory.
15783 // Find the largest store unit
15784 MVT StoreType = MVT::i8;
15785 for (MVT Tp : MVT::integer_valuetypes()) {
15786 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
15787 StoreType = Tp;
15789 // Didn't find a legal store type.
15790 if (!TLI.isTypeLegal(StoreType))
15791 return SDValue();
15793 // Bitcast the original vector into a vector of store-size units
15794 EVT StoreVecVT =
15795 EVT::getVectorVT(*DAG.getContext(), StoreType,
15796 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
15797 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
15798 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
15799 SmallVector<SDValue, 8> Chains;
15800 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
15801 TLI.getPointerTy(DAG.getDataLayout()));
15802 SDValue BasePtr = St->getBasePtr();
15804 // Perform one or more big stores into memory.
15805 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
15806 for (unsigned I = 0; I < E; I++) {
15807 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
15808 ShuffWide, DAG.getIntPtrConstant(I, DL));
15809 SDValue Ch =
15810 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
15811 St->getAlignment(), St->getMemOperand()->getFlags());
15812 BasePtr =
15813 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
15814 Chains.push_back(Ch);
15816 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
15819 // Try taking a single vector store from an fpround (which would otherwise turn
15820 // into an expensive buildvector) and splitting it into a series of narrowing
15821 // stores.
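// For example (illustrative): storing the fp_round of an <8 x float> as
// <8 x half> becomes two VCVTN conversions, each followed by a truncating
// <4 x i32> -> <4 x i16> store of one half of the result.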
15822 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
15823 SelectionDAG &DAG) {
15824 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
15825 return SDValue();
15826 SDValue Trunc = St->getValue();
15827 if (Trunc->getOpcode() != ISD::FP_ROUND)
15828 return SDValue();
15829 EVT FromVT = Trunc->getOperand(0).getValueType();
15830 EVT ToVT = Trunc.getValueType();
15831 if (!ToVT.isVector())
15832 return SDValue();
15833 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
15834 EVT ToEltVT = ToVT.getVectorElementType();
15835 EVT FromEltVT = FromVT.getVectorElementType();
15837 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
15838 return SDValue();
15840 unsigned NumElements = 4;
15841 if (FromVT.getVectorNumElements() % NumElements != 0)
15842 return SDValue();
15844 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
15845 // use the VMOVN over splitting the store. We are looking for patterns of:
15846 // !rev: 0 N 1 N+1 2 N+2 ...
15847 // rev: N 0 N+1 1 N+2 2 ...
15848 // The shuffle may either be a single source (in which case N = NumElts/2) or
15849 // two inputs extended with concat to the same size (in which case N =
15850 // NumElts).
15851 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
15852 ArrayRef<int> M = SVN->getMask();
15853 unsigned NumElts = ToVT.getVectorNumElements();
15854 if (SVN->getOperand(1).isUndef())
15855 NumElts /= 2;
15857 unsigned Off0 = Rev ? NumElts : 0;
15858 unsigned Off1 = Rev ? 0 : NumElts;
15860 for (unsigned I = 0; I < NumElts; I += 2) {
15861 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
15862 return false;
15863 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
15864 return false;
15867 return true;
15870 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
15871 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
15872 return SDValue();
15874 LLVMContext &C = *DAG.getContext();
15875 SDLoc DL(St);
15876 // Details about the old store
15877 SDValue Ch = St->getChain();
15878 SDValue BasePtr = St->getBasePtr();
15879 Align Alignment = St->getOriginalAlign();
15880 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
15881 AAMDNodes AAInfo = St->getAAInfo();
15883 // We split the store into slices of NumElements. fp16 trunc stores are
15884 // converted with a vcvt and then stored as truncating integer stores.
15885 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
15886 EVT NewToVT = EVT::getVectorVT(
15887 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
15889 SmallVector<SDValue, 4> Stores;
15890 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
15891 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
15892 SDValue NewPtr =
15893 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
15895 SDValue Extract =
15896 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
15897 DAG.getConstant(i * NumElements, DL, MVT::i32));
15899 SDValue FPTrunc =
15900 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
15901 Extract, DAG.getConstant(0, DL, MVT::i32));
15902 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
15904 SDValue Store = DAG.getTruncStore(
15905 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
15906 NewToVT, Alignment.value(), MMOFlags, AAInfo);
15907 Stores.push_back(Store);
15909 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
15912 // Try taking a single vector store from an MVETRUNC (which would otherwise turn
15913 // into an expensive buildvector) and splitting it into a series of narrowing
15914 // stores.
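// For example (illustrative): an MVETRUNC of two <4 x i32> operands being
// stored as <8 x i16> becomes two truncating <4 x i32> -> <4 x i16> stores,
// one per operand, at consecutive offsets.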
15915 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
15916 SelectionDAG &DAG) {
15917 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
15918 return SDValue();
15919 SDValue Trunc = St->getValue();
15920 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
15921 return SDValue();
15922 EVT FromVT = Trunc->getOperand(0).getValueType();
15923 EVT ToVT = Trunc.getValueType();
15925 LLVMContext &C = *DAG.getContext();
15926 SDLoc DL(St);
15927 // Details about the old store
15928 SDValue Ch = St->getChain();
15929 SDValue BasePtr = St->getBasePtr();
15930 Align Alignment = St->getOriginalAlign();
15931 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
15932 AAMDNodes AAInfo = St->getAAInfo();
15934 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
15935 FromVT.getVectorNumElements());
15937 SmallVector<SDValue, 4> Stores;
15938 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
15939 unsigned NewOffset =
15940 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
15941 SDValue NewPtr =
15942 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
15944 SDValue Extract = Trunc.getOperand(i);
15945 SDValue Store = DAG.getTruncStore(
15946 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
15947 NewToVT, Alignment.value(), MMOFlags, AAInfo);
15948 Stores.push_back(Store);
15950 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
15953 // Given a floating point store from an extracted vector, with an integer
15954 // VGETLANE that already exists, store the existing VGETLANEu directly. This can
15955 // help reduce fp register pressure, avoid the fp extract, and allow the use
15956 // of more integer post-inc stores that are not available with vstr.
15957 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
15958 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
15959 return SDValue();
15960 SDValue Extract = St->getValue();
15961 EVT VT = Extract.getValueType();
15962 // For now this only handles f16. It may be useful for f32 too, but that will
15963 // be bitcast(extract), not the VGETLANEu we currently check here.
15964 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15965 return SDValue();
15967 SDNode *GetLane =
15968 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
15969 {Extract.getOperand(0), Extract.getOperand(1)});
15970 if (!GetLane)
15971 return SDValue();
15973 LLVMContext &C = *DAG.getContext();
15974 SDLoc DL(St);
15975 // Create a new integer store to replace the existing floating point version.
15976 SDValue Ch = St->getChain();
15977 SDValue BasePtr = St->getBasePtr();
15978 Align Alignment = St->getOriginalAlign();
15979 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
15980 AAMDNodes AAInfo = St->getAAInfo();
15981 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
15982 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
15983 St->getPointerInfo(), NewToVT,
15984 Alignment.value(), MMOFlags, AAInfo);
15986 return Store;
15989 /// PerformSTORECombine - Target-specific dag combine xforms for
15990 /// ISD::STORE.
15991 static SDValue PerformSTORECombine(SDNode *N,
15992 TargetLowering::DAGCombinerInfo &DCI,
15993 const ARMSubtarget *Subtarget) {
15994 StoreSDNode *St = cast<StoreSDNode>(N);
15995 if (St->isVolatile())
15996 return SDValue();
15997 SDValue StVal = St->getValue();
15998 EVT VT = StVal.getValueType();
16000 if (Subtarget->hasNEON())
16001 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16002 return Store;
16004 if (Subtarget->hasMVEIntegerOps()) {
16005 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16006 return NewToken;
16007 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16008 return NewChain;
16009 if (SDValue NewToken =
16010 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16011 return NewToken;
16014 if (!ISD::isNormalStore(St))
16015 return SDValue();
16017 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16018 // ARM stores of arguments in the same cache line.
16019 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16020 StVal.getNode()->hasOneUse()) {
16021 SelectionDAG &DAG = DCI.DAG;
16022 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16023 SDLoc DL(St);
16024 SDValue BasePtr = St->getBasePtr();
16025 SDValue NewST1 = DAG.getStore(
16026 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16027 BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
16028 St->getMemOperand()->getFlags());
16030 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16031 DAG.getConstant(4, DL, MVT::i32));
16032 return DAG.getStore(NewST1.getValue(0), DL,
16033 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16034 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16035 St->getOriginalAlign(),
16036 St->getMemOperand()->getFlags());
16039 if (StVal.getValueType() == MVT::i64 &&
16040 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16042 // Bitcast an i64 store extracted from a vector to f64.
16043 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16044 SelectionDAG &DAG = DCI.DAG;
16045 SDLoc dl(StVal);
16046 SDValue IntVec = StVal.getOperand(0);
16047 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16048 IntVec.getValueType().getVectorNumElements());
16049 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16050 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16051 Vec, StVal.getOperand(1));
16052 dl = SDLoc(N);
16053 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16054 // Make the DAGCombiner fold the bitcasts.
16055 DCI.AddToWorklist(Vec.getNode());
16056 DCI.AddToWorklist(ExtElt.getNode());
16057 DCI.AddToWorklist(V.getNode());
16058 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16059 St->getPointerInfo(), St->getAlignment(),
16060 St->getMemOperand()->getFlags(), St->getAAInfo());
16063 // If this is a legal vector store, try to combine it into a VST1_UPD.
16064 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16065 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16066 return CombineBaseUpdate(N, DCI);
16068 return SDValue();
16071 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16072 /// can replace combinations of VMUL and VCVT (floating-point to integer)
16073 /// when the VMUL has a constant operand that is a power of 2.
16075 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16076 /// vmul.f32 d16, d17, d16
16077 /// vcvt.s32.f32 d16, d16
16078 /// becomes:
16079 /// vcvt.s32.f32 d16, d16, #3
16080 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16081 const ARMSubtarget *Subtarget) {
16082 if (!Subtarget->hasNEON())
16083 return SDValue();
16085 SDValue Op = N->getOperand(0);
16086 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16087 Op.getOpcode() != ISD::FMUL)
16088 return SDValue();
16090 SDValue ConstVec = Op->getOperand(1);
16091 if (!isa<BuildVectorSDNode>(ConstVec))
16092 return SDValue();
16094 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16095 uint32_t FloatBits = FloatTy.getSizeInBits();
16096 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16097 uint32_t IntBits = IntTy.getSizeInBits();
16098 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16099 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16100 // These instructions only exist converting from f32 to i32. We can handle
16101 // smaller integers by generating an extra truncate, but larger ones would
16102 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16103 // these instructions only support v2i32/v4i32 types.
16104 return SDValue();
16107 BitVector UndefElements;
16108 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16109 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16110 if (C == -1 || C == 0 || C > 32)
16111 return SDValue();
16113 SDLoc dl(N);
16114 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16115 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16116 Intrinsic::arm_neon_vcvtfp2fxu;
16117 SDValue FixConv = DAG.getNode(
16118 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16119 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16120 DAG.getConstant(C, dl, MVT::i32));
16122 if (IntBits < FloatBits)
16123 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16125 return FixConv;
16128 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
16129 /// can replace combinations of VCVT (integer to floating-point) and VDIV
16130 /// when the VDIV has a constant operand that is a power of 2.
16132 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16133 /// vcvt.f32.s32 d16, d16
16134 /// vdiv.f32 d16, d17, d16
16135 /// becomes:
16136 /// vcvt.f32.s32 d16, d16, #3
16137 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
16138 const ARMSubtarget *Subtarget) {
16139 if (!Subtarget->hasNEON())
16140 return SDValue();
16142 SDValue Op = N->getOperand(0);
16143 unsigned OpOpcode = Op.getNode()->getOpcode();
16144 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
16145 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
16146 return SDValue();
16148 SDValue ConstVec = N->getOperand(1);
16149 if (!isa<BuildVectorSDNode>(ConstVec))
16150 return SDValue();
16152 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
16153 uint32_t FloatBits = FloatTy.getSizeInBits();
16154 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
16155 uint32_t IntBits = IntTy.getSizeInBits();
16156 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16157 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16158 // These instructions only exist converting from i32 to f32. We can handle
16159 // smaller integers by generating an extra extend, but larger ones would
16160 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16161 // these instructions only support v2i32/v4i32 types.
16162 return SDValue();
16165 BitVector UndefElements;
16166 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16167 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16168 if (C == -1 || C == 0 || C > 32)
16169 return SDValue();
16171 SDLoc dl(N);
16172 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
16173 SDValue ConvInput = Op.getOperand(0);
16174 if (IntBits < FloatBits)
16175 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
16176 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16177 ConvInput);
16179 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
16180 Intrinsic::arm_neon_vcvtfxu2fp;
16181 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
16182 Op.getValueType(),
16183 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
16184 ConvInput, DAG.getConstant(C, dl, MVT::i32));
16187 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
16188 const ARMSubtarget *ST) {
16189 if (!ST->hasMVEIntegerOps())
16190 return SDValue();
16192 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
16193 EVT ResVT = N->getValueType(0);
16194 SDValue N0 = N->getOperand(0);
16195 SDLoc dl(N);
16197 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
16198 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
16199 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
16200 N0.getValueType() == MVT::v16i8)) {
16201 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
16202 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
16203 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
16206 // We are looking for something that will have illegal types if left alone,
16207 // but that we can convert to a single instruction under MVE. For example
16208 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
16209 // or
16210 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
16212 // The legal cases are:
16213 // VADDV u/s 8/16/32
16214 // VMLAV u/s 8/16/32
16215 // VADDLV u/s 32
16216 // VMLALV u/s 16/32
16218 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
16219 // extend it and use v4i32 instead.
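// For example (one possible case):
//   vecreduce_add(mul(sext(v8i8 A to v8i32), sext(v8i8 B to v8i32))) : i32
// has v8i8 inputs, which are sign extended to v8i16 below and then matched as
//   VMLAV.s16 A, B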
16220 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
16221 EVT AVT = A.getValueType();
16222 return any_of(ExtTypes, [&](MVT Ty) {
16223 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
16224 AVT.bitsLE(Ty);
16227 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
16228 EVT AVT = A.getValueType();
16229 if (!AVT.is128BitVector())
16230 A = DAG.getNode(ExtendCode, dl,
16231 AVT.changeVectorElementType(MVT::getIntegerVT(
16232 128 / AVT.getVectorMinNumElements())),
16234 return A;
16236 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
16237 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
16238 return SDValue();
16239 SDValue A = N0->getOperand(0);
16240 if (ExtTypeMatches(A, ExtTypes))
16241 return ExtendIfNeeded(A, ExtendCode);
16242 return SDValue();
16244 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
16245 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
16246 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16247 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
16248 return SDValue();
16249 Mask = N0->getOperand(0);
16250 SDValue Ext = N0->getOperand(1);
16251 if (Ext->getOpcode() != ExtendCode)
16252 return SDValue();
16253 SDValue A = Ext->getOperand(0);
16254 if (ExtTypeMatches(A, ExtTypes))
16255 return ExtendIfNeeded(A, ExtendCode);
16256 return SDValue();
16258 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16259 SDValue &A, SDValue &B) {
16260 // For a vmla we are trying to match a larger pattern:
16261 // ExtA = sext/zext A
16262 // ExtB = sext/zext B
16263 // Mul = mul ExtA, ExtB
16264 // vecreduce.add Mul
16265 // There might also be an extra extend between the mul and the addreduce, so
16266 // long as the bitwidth is high enough to make them equivalent (for example
16267 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
16268 if (ResVT != RetTy)
16269 return false;
16270 SDValue Mul = N0;
16271 if (Mul->getOpcode() == ExtendCode &&
16272 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
16273 ResVT.getScalarSizeInBits())
16274 Mul = Mul->getOperand(0);
16275 if (Mul->getOpcode() != ISD::MUL)
16276 return false;
16277 SDValue ExtA = Mul->getOperand(0);
16278 SDValue ExtB = Mul->getOperand(1);
16279 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
16280 return false;
16281 A = ExtA->getOperand(0);
16282 B = ExtB->getOperand(0);
16283 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
16284 A = ExtendIfNeeded(A, ExtendCode);
16285 B = ExtendIfNeeded(B, ExtendCode);
16286 return true;
16288 return false;
16290 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16291 SDValue &A, SDValue &B, SDValue &Mask) {
16292 // Same as the pattern above with a select for the zero predicated lanes
16293 // ExtA = sext/zext A
16294 // ExtB = sext/zext B
16295 // Mul = mul ExtA, ExtB
16296 // N0 = select Mask, Mul, 0
16297 // vecreduce.add N0
16298 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16299 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
16300 return false;
16301 Mask = N0->getOperand(0);
16302 SDValue Mul = N0->getOperand(1);
16303 if (Mul->getOpcode() == ExtendCode &&
16304 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
16305 ResVT.getScalarSizeInBits())
16306 Mul = Mul->getOperand(0);
16307 if (Mul->getOpcode() != ISD::MUL)
16308 return false;
16309 SDValue ExtA = Mul->getOperand(0);
16310 SDValue ExtB = Mul->getOperand(1);
16311 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
16312 return false;
16313 A = ExtA->getOperand(0);
16314 B = ExtB->getOperand(0);
16315 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
16316 A = ExtendIfNeeded(A, ExtendCode);
16317 B = ExtendIfNeeded(B, ExtendCode);
16318 return true;
16320 return false;
16322 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
16323 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
16324 // reductions. The operands are extended with MVEEXT, but as they are
16325 // reductions the lane orders do not matter. MVEEXT may be combined with
16326 // loads to produce two extending loads, or else they will be expanded to
16327 // VREV/VMOVL.
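// For instance (roughly), a v16i8 VMLALV becomes:
//   Ext0 = MVEEXT Ops[0]   (two v8i16 halves)
//   Ext1 = MVEEXT Ops[1]   (two v8i16 halves)
//   MLA0 = VMLALV  Ext0.lo, Ext1.lo
//   MLA1 = VMLALVA MLA0.lo, MLA0.hi, Ext0.hi, Ext1.hi
//   Res  = BUILD_PAIR i64 MLA1.lo, MLA1.hi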
16328 EVT VT = Ops[0].getValueType();
16329 if (VT == MVT::v16i8) {
16330 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
16331 "Unexpected illegal long reduction opcode");
16332 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
16334 SDValue Ext0 =
16335 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
16336 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
16337 SDValue Ext1 =
16338 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
16339 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
16341 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
16342 Ext0, Ext1);
16343 SDValue MLA1 =
16344 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
16345 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
16346 Ext0.getValue(1), Ext1.getValue(1));
16347 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
16349 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
16350 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
16351 SDValue(Node.getNode(), 1));
16354 SDValue A, B;
16355 SDValue Mask;
16356 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
16357 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
16358 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
16359 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
16360 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
16361 A, B))
16362 return Create64bitNode(ARMISD::VMLALVs, {A, B});
16363 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
16364 A, B))
16365 return Create64bitNode(ARMISD::VMLALVu, {A, B});
16366 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
16367 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16368 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
16369 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
16370 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16371 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
16373 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
16374 Mask))
16375 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
16376 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
16377 Mask))
16378 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
16379 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
16380 Mask))
16381 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
16382 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
16383 Mask))
16384 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
16385 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
16386 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16387 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
16388 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
16389 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16390 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
16392 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
16393 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
16394 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
16395 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
16396 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
16397 return Create64bitNode(ARMISD::VADDLVs, {A});
16398 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
16399 return Create64bitNode(ARMISD::VADDLVu, {A});
16400 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
16401 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16402 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
16403 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
16404 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16405 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
16407 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
16408 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
16409 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
16410 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
16411 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
16412 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
16413 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
16414 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
16415 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
16416 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16417 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
16418 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
16419 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16420 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
16422 // Some complications. We can get a case where the two inputs of the mul are
16423 // the same, then the output sext will have been helpfully converted to a
16424 // zext. Turn it back.
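// For example (illustrative):
//   vecreduce_add(zext(mul(sext(A), sext(A))))
// is turned back into
//   vecreduce_add(sext(mul(sext(A), sext(A))))
// so that the signed VMLAV patterns above can match it again.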
16425 SDValue Op = N0;
16426 if (Op->getOpcode() == ISD::VSELECT)
16427 Op = Op->getOperand(1);
16428 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
16429 Op->getOperand(0)->getOpcode() == ISD::MUL) {
16430 SDValue Mul = Op->getOperand(0);
16431 if (Mul->getOperand(0) == Mul->getOperand(1) &&
16432 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
16433 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
16434 if (Op != N0)
16435 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
16436 N0->getOperand(0), Ext, N0->getOperand(2));
16437 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
16441 return SDValue();
16444 static SDValue PerformVMOVNCombine(SDNode *N,
16445 TargetLowering::DAGCombinerInfo &DCI) {
16446 SDValue Op0 = N->getOperand(0);
16447 SDValue Op1 = N->getOperand(1);
16448 unsigned IsTop = N->getConstantOperandVal(2);
16450 // VMOVNT a undef -> a
16451 // VMOVNB a undef -> a
16452 // VMOVNB undef a -> a
16453 if (Op1->isUndef())
16454 return Op0;
16455 if (Op0->isUndef() && !IsTop)
16456 return Op1;
16458 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
16459 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
16460 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
16461 Op1->getOpcode() == ARMISD::VQMOVNu) &&
16462 Op1->getConstantOperandVal(2) == 0)
16463 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
16464 Op0, Op1->getOperand(1), N->getOperand(2));
16466 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
16467 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
16468 // into the top or bottom lanes.
16469 unsigned NumElts = N->getValueType(0).getVectorNumElements();
16470 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
16471 APInt Op0DemandedElts =
16472 IsTop ? Op1DemandedElts
16473 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
16475 APInt KnownUndef, KnownZero;
16476 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
16477 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
16478 KnownZero, DCI))
16479 return SDValue(N, 0);
16480 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
16481 KnownZero, DCI))
16482 return SDValue(N, 0);
16484 return SDValue();
16487 static SDValue PerformVQMOVNCombine(SDNode *N,
16488 TargetLowering::DAGCombinerInfo &DCI) {
16489 SDValue Op0 = N->getOperand(0);
16490 unsigned IsTop = N->getConstantOperandVal(2);
16492 unsigned NumElts = N->getValueType(0).getVectorNumElements();
16493 APInt Op0DemandedElts =
16494 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
16495 : APInt::getHighBitsSet(2, 1));
16497 APInt KnownUndef, KnownZero;
16498 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
16499 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
16500 KnownZero, DCI))
16501 return SDValue(N, 0);
16502 return SDValue();
16505 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
16506 SDLoc DL(N);
16507 SDValue Op0 = N->getOperand(0);
16508 SDValue Op1 = N->getOperand(1);
16510 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
16511 // uses of the intrinsics.
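// For example (illustrative):
//   LSLL(lo, hi, -4)  ->  LSRL(lo, hi, 4)
// and a shift amount of 0 simply forwards the two input operands unchanged.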
16512 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
16513 int ShiftAmt = C->getSExtValue();
16514 if (ShiftAmt == 0) {
16515 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
16516 DAG.ReplaceAllUsesWith(N, Merge.getNode());
16517 return SDValue();
16520 if (ShiftAmt >= -32 && ShiftAmt < 0) {
16521 unsigned NewOpcode =
16522 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
16523 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
16524 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
16525 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
16526 return NewShift;
16530 return SDValue();
16533 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
16534 SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
16535 DAGCombinerInfo &DCI) const {
16536 SelectionDAG &DAG = DCI.DAG;
16537 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
16538 switch (IntNo) {
16539 default:
16540 // Don't do anything for most intrinsics.
16541 break;
16543 // Vector shifts: check for immediate versions and lower them.
16544 // Note: This is done during DAG combining instead of DAG legalizing because
16545 // the build_vectors for 64-bit vector element shift counts are generally
16546 // not legal, and it is hard to see their values after they get legalized to
16547 // loads from a constant pool.
16548 case Intrinsic::arm_neon_vshifts:
16549 case Intrinsic::arm_neon_vshiftu:
16550 case Intrinsic::arm_neon_vrshifts:
16551 case Intrinsic::arm_neon_vrshiftu:
16552 case Intrinsic::arm_neon_vrshiftn:
16553 case Intrinsic::arm_neon_vqshifts:
16554 case Intrinsic::arm_neon_vqshiftu:
16555 case Intrinsic::arm_neon_vqshiftsu:
16556 case Intrinsic::arm_neon_vqshiftns:
16557 case Intrinsic::arm_neon_vqshiftnu:
16558 case Intrinsic::arm_neon_vqshiftnsu:
16559 case Intrinsic::arm_neon_vqrshiftns:
16560 case Intrinsic::arm_neon_vqrshiftnu:
16561 case Intrinsic::arm_neon_vqrshiftnsu: {
16562 EVT VT = N->getOperand(1).getValueType();
16563 int64_t Cnt;
16564 unsigned VShiftOpc = 0;
16566 switch (IntNo) {
16567 case Intrinsic::arm_neon_vshifts:
16568 case Intrinsic::arm_neon_vshiftu:
16569 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
16570 VShiftOpc = ARMISD::VSHLIMM;
16571 break;
16573 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
16574 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
16575 : ARMISD::VSHRuIMM);
16576 break;
16578 return SDValue();
16580 case Intrinsic::arm_neon_vrshifts:
16581 case Intrinsic::arm_neon_vrshiftu:
16582 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
16583 break;
16584 return SDValue();
16586 case Intrinsic::arm_neon_vqshifts:
16587 case Intrinsic::arm_neon_vqshiftu:
16588 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
16589 break;
16590 return SDValue();
16592 case Intrinsic::arm_neon_vqshiftsu:
16593 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
16594 break;
16595 llvm_unreachable("invalid shift count for vqshlu intrinsic");
16597 case Intrinsic::arm_neon_vrshiftn:
16598 case Intrinsic::arm_neon_vqshiftns:
16599 case Intrinsic::arm_neon_vqshiftnu:
16600 case Intrinsic::arm_neon_vqshiftnsu:
16601 case Intrinsic::arm_neon_vqrshiftns:
16602 case Intrinsic::arm_neon_vqrshiftnu:
16603 case Intrinsic::arm_neon_vqrshiftnsu:
16604 // Narrowing shifts require an immediate right shift.
16605 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
16606 break;
16607 llvm_unreachable("invalid shift count for narrowing vector shift "
16608 "intrinsic");
16610 default:
16611 llvm_unreachable("unhandled vector shift");
16614 switch (IntNo) {
16615 case Intrinsic::arm_neon_vshifts:
16616 case Intrinsic::arm_neon_vshiftu:
16617 // Opcode already set above.
16618 break;
16619 case Intrinsic::arm_neon_vrshifts:
16620 VShiftOpc = ARMISD::VRSHRsIMM;
16621 break;
16622 case Intrinsic::arm_neon_vrshiftu:
16623 VShiftOpc = ARMISD::VRSHRuIMM;
16624 break;
16625 case Intrinsic::arm_neon_vrshiftn:
16626 VShiftOpc = ARMISD::VRSHRNIMM;
16627 break;
16628 case Intrinsic::arm_neon_vqshifts:
16629 VShiftOpc = ARMISD::VQSHLsIMM;
16630 break;
16631 case Intrinsic::arm_neon_vqshiftu:
16632 VShiftOpc = ARMISD::VQSHLuIMM;
16633 break;
16634 case Intrinsic::arm_neon_vqshiftsu:
16635 VShiftOpc = ARMISD::VQSHLsuIMM;
16636 break;
16637 case Intrinsic::arm_neon_vqshiftns:
16638 VShiftOpc = ARMISD::VQSHRNsIMM;
16639 break;
16640 case Intrinsic::arm_neon_vqshiftnu:
16641 VShiftOpc = ARMISD::VQSHRNuIMM;
16642 break;
16643 case Intrinsic::arm_neon_vqshiftnsu:
16644 VShiftOpc = ARMISD::VQSHRNsuIMM;
16645 break;
16646 case Intrinsic::arm_neon_vqrshiftns:
16647 VShiftOpc = ARMISD::VQRSHRNsIMM;
16648 break;
16649 case Intrinsic::arm_neon_vqrshiftnu:
16650 VShiftOpc = ARMISD::VQRSHRNuIMM;
16651 break;
16652 case Intrinsic::arm_neon_vqrshiftnsu:
16653 VShiftOpc = ARMISD::VQRSHRNsuIMM;
16654 break;
16657 SDLoc dl(N);
16658 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
16659 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
16662 case Intrinsic::arm_neon_vshiftins: {
16663 EVT VT = N->getOperand(1).getValueType();
16664 int64_t Cnt;
16665 unsigned VShiftOpc = 0;
16667 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
16668 VShiftOpc = ARMISD::VSLIIMM;
16669 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
16670 VShiftOpc = ARMISD::VSRIIMM;
16671 else {
16672 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
16675 SDLoc dl(N);
16676 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
16677 N->getOperand(1), N->getOperand(2),
16678 DAG.getConstant(Cnt, dl, MVT::i32));
16681 case Intrinsic::arm_neon_vqrshifts:
16682 case Intrinsic::arm_neon_vqrshiftu:
16683 // No immediate versions of these to check for.
16684 break;
16686 case Intrinsic::arm_mve_vqdmlah:
16687 case Intrinsic::arm_mve_vqdmlash:
16688 case Intrinsic::arm_mve_vqrdmlah:
16689 case Intrinsic::arm_mve_vqrdmlash:
16690 case Intrinsic::arm_mve_vmla_n_predicated:
16691 case Intrinsic::arm_mve_vmlas_n_predicated:
16692 case Intrinsic::arm_mve_vqdmlah_predicated:
16693 case Intrinsic::arm_mve_vqdmlash_predicated:
16694 case Intrinsic::arm_mve_vqrdmlah_predicated:
16695 case Intrinsic::arm_mve_vqrdmlash_predicated: {
16696 // These intrinsics all take an i32 scalar operand which is narrowed to the
16697 // size of a single lane of the vector type they return. So we don't need
16698 // any bits of that operand above that point, which allows us to eliminate
16699 // uxth/sxth.
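// For instance (one possible case), with a v8i16 result only the low 16 bits
// of the scalar in something like
//   vqdmlah(a, b, sext_inreg(r, i16))
// are demanded, so the sext_inreg (an sxth) can be removed.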
16700 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16701 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
16702 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
16703 return SDValue();
16704 break;
16707 case Intrinsic::arm_mve_minv:
16708 case Intrinsic::arm_mve_maxv:
16709 case Intrinsic::arm_mve_minav:
16710 case Intrinsic::arm_mve_maxav:
16711 case Intrinsic::arm_mve_minv_predicated:
16712 case Intrinsic::arm_mve_maxv_predicated:
16713 case Intrinsic::arm_mve_minav_predicated:
16714 case Intrinsic::arm_mve_maxav_predicated: {
16715 // These intrinsics all take an i32 scalar operand which is narrowed to the
16716 // size of a single lane of the vector type they take as the other input.
16717 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
16718 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
16719 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
16720 return SDValue();
16721 break;
16724 case Intrinsic::arm_mve_addv: {
16725 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
16726 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
16727 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
16728 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
16729 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
16732 case Intrinsic::arm_mve_addlv:
16733 case Intrinsic::arm_mve_addlv_predicated: {
16734 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
16735 // which recombines the two outputs into an i64.
16736 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
16737 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
16738 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
16739 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
16741 SmallVector<SDValue, 4> Ops;
16742 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
16743 if (i != 2) // skip the unsigned flag
16744 Ops.push_back(N->getOperand(i));
16746 SDLoc dl(N);
16747 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
16748 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
16749 val.getValue(1));
16753 return SDValue();
16756 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
16757 /// lowers them. As with the vector shift intrinsics, this is done during DAG
16758 /// combining instead of DAG legalizing because the build_vectors for 64-bit
16759 /// vector element shift counts are generally not legal, and it is hard to see
16760 /// their values after they get legalized to loads from a constant pool.
16761 static SDValue PerformShiftCombine(SDNode *N,
16762 TargetLowering::DAGCombinerInfo &DCI,
16763 const ARMSubtarget *ST) {
16764 SelectionDAG &DAG = DCI.DAG;
16765 EVT VT = N->getValueType(0);
16766 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
16767 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
16768 // 16 bits of x are zero. This optimizes rev + lsr 16 to rev16.
16769 SDValue N1 = N->getOperand(1);
16770 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
16771 SDValue N0 = N->getOperand(0);
16772 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
16773 DAG.MaskedValueIsZero(N0.getOperand(0),
16774 APInt::getHighBitsSet(32, 16)))
16775 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
16779 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
16780 N->getOperand(0)->getOpcode() == ISD::AND &&
16781 N->getOperand(0)->hasOneUse()) {
16782 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16783 return SDValue();
16784 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
16785 // usually show up because instcombine prefers to canonicalize it to
16786 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
16787 // out of GEP lowering in some cases.
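// For instance (with concrete constants), for a uint32_t x:
//   (x & 0x3FFFF) << 2  ==  (x << 14) >> 12
// since countLeadingZeros(0x3FFFF) == 14 is greater than the shift amount 2,
// and the shl+srl pair avoids materializing the mask in a register.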
16788 SDValue N0 = N->getOperand(0);
16789 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
16790 if (!ShiftAmtNode)
16791 return SDValue();
16792 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
16793 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
16794 if (!AndMaskNode)
16795 return SDValue();
16796 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
16797 // Don't transform uxtb/uxth.
16798 if (AndMask == 255 || AndMask == 65535)
16799 return SDValue();
16800 if (isMask_32(AndMask)) {
16801 uint32_t MaskedBits = countLeadingZeros(AndMask);
16802 if (MaskedBits > ShiftAmt) {
16803 SDLoc DL(N);
16804 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
16805 DAG.getConstant(MaskedBits, DL, MVT::i32));
16806 return DAG.getNode(
16807 ISD::SRL, DL, MVT::i32, SHL,
16808 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
16813 // Nothing to be done for scalar shifts.
16814 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16815 if (!VT.isVector() || !TLI.isTypeLegal(VT))
16816 return SDValue();
16817 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
16818 return SDValue();
16820 int64_t Cnt;
16822 switch (N->getOpcode()) {
16823 default: llvm_unreachable("unexpected shift opcode");
16825 case ISD::SHL:
16826 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
16827 SDLoc dl(N);
16828 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
16829 DAG.getConstant(Cnt, dl, MVT::i32));
16831 break;
16833 case ISD::SRA:
16834 case ISD::SRL:
16835 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
16836 unsigned VShiftOpc =
16837 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
16838 SDLoc dl(N);
16839 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
16840 DAG.getConstant(Cnt, dl, MVT::i32));
16843 return SDValue();
16846 // Look for a sign, zero or fp extend of a larger than legal load. This can be
16847 // split into multiple extending loads, which are simpler to deal with than an
16848 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL
16849 // to convert the type to an f32.
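// For example (illustrative): a (zext (v8i8 load)) to v8i32 can be split into
// two v4i8->v4i32 zero-extending loads at byte offsets 0 and 4, whose results
// are then joined with a CONCAT_VECTORS.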
16850 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
16851 SDValue N0 = N->getOperand(0);
16852 if (N0.getOpcode() != ISD::LOAD)
16853 return SDValue();
16854 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
16855 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
16856 LD->getExtensionType() != ISD::NON_EXTLOAD)
16857 return SDValue();
16858 EVT FromVT = LD->getValueType(0);
16859 EVT ToVT = N->getValueType(0);
16860 if (!ToVT.isVector())
16861 return SDValue();
16862 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
16863 EVT ToEltVT = ToVT.getVectorElementType();
16864 EVT FromEltVT = FromVT.getVectorElementType();
16866 unsigned NumElements = 0;
16867 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
16868 NumElements = 4;
16869 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
16870 NumElements = 4;
16871 if (NumElements == 0 ||
16872 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
16873 FromVT.getVectorNumElements() % NumElements != 0 ||
16874 !isPowerOf2_32(NumElements))
16875 return SDValue();
16877 LLVMContext &C = *DAG.getContext();
16878 SDLoc DL(LD);
16879 // Details about the old load
16880 SDValue Ch = LD->getChain();
16881 SDValue BasePtr = LD->getBasePtr();
16882 Align Alignment = LD->getOriginalAlign();
16883 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
16884 AAMDNodes AAInfo = LD->getAAInfo();
16886 ISD::LoadExtType NewExtType =
16887 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
16888 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
16889 EVT NewFromVT = EVT::getVectorVT(
16890 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
16891 EVT NewToVT = EVT::getVectorVT(
16892 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
16894 SmallVector<SDValue, 4> Loads;
16895 SmallVector<SDValue, 4> Chains;
16896 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16897 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
16898 SDValue NewPtr =
16899 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
16901 SDValue NewLoad =
16902 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
16903 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
16904 Alignment, MMOFlags, AAInfo);
16905 Loads.push_back(NewLoad);
16906 Chains.push_back(SDValue(NewLoad.getNode(), 1));
16909 // Float truncs need to be extended with VCVTBs into their floating point types.
16910 if (FromEltVT == MVT::f16) {
16911 SmallVector<SDValue, 4> Extends;
16913 for (unsigned i = 0; i < Loads.size(); i++) {
16914 SDValue LoadBC =
16915 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
16916 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
16917 DAG.getConstant(0, DL, MVT::i32));
16918 Extends.push_back(FPExt);
16921 Loads = Extends;
16924 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16925 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
16926 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
16929 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
16930 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
16931 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
16932 const ARMSubtarget *ST) {
16933 SDValue N0 = N->getOperand(0);
16935 // Check for sign- and zero-extensions of vector extract operations of 8- and
16936 // 16-bit vector elements. NEON and MVE support these directly. They are
16937 // handled during DAG combining because type legalization will promote them
16938 // to 32-bit types and it is messy to recognize the operations after that.
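// For instance (schematically):
//   (i32 (sext (extract_vector_elt (v8i16 V), lane)))
// becomes a single
//   (VGETLANEs V, lane)
// when the lane index is a constant and the vector type is legal.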
16939 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
16940 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16941 SDValue Vec = N0.getOperand(0);
16942 SDValue Lane = N0.getOperand(1);
16943 EVT VT = N->getValueType(0);
16944 EVT EltVT = N0.getValueType();
16945 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16947 if (VT == MVT::i32 &&
16948 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
16949 TLI.isTypeLegal(Vec.getValueType()) &&
16950 isa<ConstantSDNode>(Lane)) {
16952 unsigned Opc = 0;
16953 switch (N->getOpcode()) {
16954 default: llvm_unreachable("unexpected opcode");
16955 case ISD::SIGN_EXTEND:
16956 Opc = ARMISD::VGETLANEs;
16957 break;
16958 case ISD::ZERO_EXTEND:
16959 case ISD::ANY_EXTEND:
16960 Opc = ARMISD::VGETLANEu;
16961 break;
16963 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
16967 if (ST->hasMVEIntegerOps())
16968 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
16969 return NewLoad;
16971 return SDValue();
16974 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
16975 const ARMSubtarget *ST) {
16976 if (ST->hasMVEFloatOps())
16977 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
16978 return NewLoad;
16980 return SDValue();
16983 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
16984 /// saturates.
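///
/// For example (illustrative, v4i32 case):
///   smin(smax(x, -32768), 32767)
/// can become a VQMOVNs of x into the bottom lanes of a v8i16, followed by a
/// sign_extend_inreg back to v4i32.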
16985 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
16986 const ARMSubtarget *ST) {
16987 EVT VT = N->getValueType(0);
16988 SDValue N0 = N->getOperand(0);
16989 if (!ST->hasMVEIntegerOps())
16990 return SDValue();
16992 if (SDValue V = PerformVQDMULHCombine(N, DAG))
16993 return V;
16995 if (VT != MVT::v4i32 && VT != MVT::v8i16)
16996 return SDValue();
16998 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
16999 // Check one is a smin and the other is a smax
17000 if (Min->getOpcode() != ISD::SMIN)
17001 std::swap(Min, Max);
17002 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17003 return false;
17005 APInt SaturateC;
17006 if (VT == MVT::v4i32)
17007 SaturateC = APInt(32, (1 << 15) - 1, true);
17008 else //if (VT == MVT::v8i16)
17009 SaturateC = APInt(16, (1 << 7) - 1, true);
17011 APInt MinC, MaxC;
17012 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17013 MinC != SaturateC)
17014 return false;
17015 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
17016 MaxC != ~SaturateC)
17017 return false;
17018 return true;
17021 if (IsSignedSaturate(N, N0.getNode())) {
17022 SDLoc DL(N);
17023 MVT ExtVT, HalfVT;
17024 if (VT == MVT::v4i32) {
17025 HalfVT = MVT::v8i16;
17026 ExtVT = MVT::v4i16;
17027 } else { // if (VT == MVT::v8i16)
17028 HalfVT = MVT::v16i8;
17029 ExtVT = MVT::v8i8;
17032 // Create a VQMOVNB with undef top lanes, then sign extend into the top
17033 // half. That extend will hopefully be removed if only the bottom bits are
17034 // demanded (through a truncating store, for example).
17035 SDValue VQMOVN =
17036 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
17037 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
17038 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17039 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
17040 DAG.getValueType(ExtVT));
17043 auto IsUnsignedSaturate = [&](SDNode *Min) {
17044 // For unsigned, we just need to check for <= 0xffff
17045 if (Min->getOpcode() != ISD::UMIN)
17046 return false;
17048 APInt SaturateC;
17049 if (VT == MVT::v4i32)
17050 SaturateC = APInt(32, (1 << 16) - 1, true);
17051 else //if (VT == MVT::v8i16)
17052 SaturateC = APInt(16, (1 << 8) - 1, true);
17054 APInt MinC;
17055 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17056 MinC != SaturateC)
17057 return false;
17058 return true;
17061 if (IsUnsignedSaturate(N)) {
17062 SDLoc DL(N);
17063 MVT HalfVT;
17064 unsigned ExtConst;
17065 if (VT == MVT::v4i32) {
17066 HalfVT = MVT::v8i16;
17067 ExtConst = 0x0000FFFF;
17068 } else { //if (VT == MVT::v8i16)
17069 HalfVT = MVT::v16i8;
17070 ExtConst = 0x00FF;
17073 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
17074 // an AND. That extend will hopefully be removed if only the bottom bits are
17075 // demanded (through a truncating store, for example).
17076 SDValue VQMOVN =
17077 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
17078 DAG.getConstant(0, DL, MVT::i32));
17079 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17080 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
17081 DAG.getConstant(ExtConst, DL, VT));
17084 return SDValue();
17087 static const APInt *isPowerOf2Constant(SDValue V) {
17088 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
17089 if (!C)
17090 return nullptr;
17091 const APInt *CV = &C->getAPIntValue();
17092 return CV->isPowerOf2() ? CV : nullptr;
17095 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
17096 // If we have a CMOV, OR and AND combination such as:
17097 // if (x & CN)
17098 // y |= CM;
17100 // And:
17101 // * CN is a single bit;
17102 // * All bits covered by CM are known zero in y
17104 // Then we can convert this into a sequence of BFI instructions. This will
17105 // always be a win if CM is a single bit, will always be no worse than the
17106 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
17107 // three bits (due to the extra IT instruction).
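// For example (illustrative), with CN == 0x4 and CM == 0x30 and bits 4-5 of y
// known zero:
//   if (x & 0x4) y |= 0x30;
// becomes roughly
//   lsr rX, x, #2
//   bfi y, rX, #4, #1
//   bfi y, rX, #5, #1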
17109 SDValue Op0 = CMOV->getOperand(0);
17110 SDValue Op1 = CMOV->getOperand(1);
17111 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
17112 auto CC = CCNode->getAPIntValue().getLimitedValue();
17113 SDValue CmpZ = CMOV->getOperand(4);
17115 // The compare must be against zero.
17116 if (!isNullConstant(CmpZ->getOperand(1)))
17117 return SDValue();
17119 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
17120 SDValue And = CmpZ->getOperand(0);
17121 if (And->getOpcode() != ISD::AND)
17122 return SDValue();
17123 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
17124 if (!AndC)
17125 return SDValue();
17126 SDValue X = And->getOperand(0);
17128 if (CC == ARMCC::EQ) {
17129 // We're performing an "equal to zero" compare. Swap the operands so we
17130 // canonicalize on a "not equal to zero" compare.
17131 std::swap(Op0, Op1);
17132 } else {
17133 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
17136 if (Op1->getOpcode() != ISD::OR)
17137 return SDValue();
17139 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
17140 if (!OrC)
17141 return SDValue();
17142 SDValue Y = Op1->getOperand(0);
17144 if (Op0 != Y)
17145 return SDValue();
17147 // Now, is it profitable to continue?
17148 APInt OrCI = OrC->getAPIntValue();
17149 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
17150 if (OrCI.countPopulation() > Heuristic)
17151 return SDValue();
17153 // Lastly, can we determine that the bits defined by OrCI
17154 // are zero in Y?
17155 KnownBits Known = DAG.computeKnownBits(Y);
17156 if ((OrCI & Known.Zero) != OrCI)
17157 return SDValue();
17159 // OK, we can do the combine.
17160 SDValue V = Y;
17161 SDLoc dl(X);
17162 EVT VT = X.getValueType();
17163 unsigned BitInX = AndC->logBase2();
17165 if (BitInX != 0) {
17166 // We must shift X first.
17167 X = DAG.getNode(ISD::SRL, dl, VT, X,
17168 DAG.getConstant(BitInX, dl, VT));
17171 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
17172 BitInY < NumActiveBits; ++BitInY) {
17173 if (OrCI[BitInY] == 0)
17174 continue;
17175 APInt Mask(VT.getSizeInBits(), 0);
17176 Mask.setBit(BitInY);
17177 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
17178 // Confusingly, the operand is an *inverted* mask.
17179 DAG.getConstant(~Mask, dl, VT));
17182 return V;
17185 // Given N, the value controlling the conditional branch, search for the loop
17186 // intrinsic, returning it, along with how the value is used. We need to handle
17187 // patterns such as the following:
17188 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
17189 // (brcond (setcc (loop.decrement), 0, eq), exit)
17190 // (brcond (setcc (loop.decrement), 0, ne), header)
17191 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
17192 bool &Negate) {
17193 switch (N->getOpcode()) {
17194 default:
17195 break;
17196 case ISD::XOR: {
17197 if (!isa<ConstantSDNode>(N.getOperand(1)))
17198 return SDValue();
17199 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
17200 return SDValue();
17201 Negate = !Negate;
17202 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
17204 case ISD::SETCC: {
17205 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
17206 if (!Const)
17207 return SDValue();
17208 if (Const->isNullValue())
17209 Imm = 0;
17210 else if (Const->isOne())
17211 Imm = 1;
17212 else
17213 return SDValue();
17214 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
17215 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
17217 case ISD::INTRINSIC_W_CHAIN: {
17218 unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
17219 if (IntOp != Intrinsic::test_start_loop_iterations &&
17220 IntOp != Intrinsic::loop_decrement_reg)
17221 return SDValue();
17222 return N;
17225 return SDValue();
17228 static SDValue PerformHWLoopCombine(SDNode *N,
17229 TargetLowering::DAGCombinerInfo &DCI,
17230 const ARMSubtarget *ST) {
17232 // The hwloop intrinsics that we're interested in are used for control flow,
17233 // either for entering or exiting the loop:
17234 // - test.start.loop.iterations will test whether its operand is zero. If it
17235 // is zero, the proceeding branch should not enter the loop.
17236 // - loop.decrement.reg also tests whether its operand is zero. If it is
17237 // zero, the proceeding branch should not branch back to the beginning of
17238 // the loop.
17239 // So here, we need to check how the brcond is using the result of each
17240 // of the intrinsics to ensure that we're branching to the right place at the
17241 // right time.
17243 ISD::CondCode CC;
17244 SDValue Cond;
17245 int Imm = 1;
17246 bool Negate = false;
17247 SDValue Chain = N->getOperand(0);
17248 SDValue Dest;
17250 if (N->getOpcode() == ISD::BRCOND) {
17251 CC = ISD::SETEQ;
17252 Cond = N->getOperand(1);
17253 Dest = N->getOperand(2);
17254 } else {
17255 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
17256 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
17257 Cond = N->getOperand(2);
17258 Dest = N->getOperand(4);
17259 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
17260 if (!Const->isOne() && !Const->isNullValue())
17261 return SDValue();
17262 Imm = Const->getZExtValue();
17263 } else
17264 return SDValue();
17267 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
17268 if (!Int)
17269 return SDValue();
17271 if (Negate)
17272 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
17274 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
17275 return (CC == ISD::SETEQ && Imm == 0) ||
17276 (CC == ISD::SETNE && Imm == 1) ||
17277 (CC == ISD::SETLT && Imm == 1) ||
17278 (CC == ISD::SETULT && Imm == 1);
17281 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
17282 return (CC == ISD::SETEQ && Imm == 1) ||
17283 (CC == ISD::SETNE && Imm == 0) ||
17284 (CC == ISD::SETGT && Imm == 0) ||
17285 (CC == ISD::SETUGT && Imm == 0) ||
17286 (CC == ISD::SETGE && Imm == 1) ||
17287 (CC == ISD::SETUGE && Imm == 1);
17290 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
17291 "unsupported condition");
17293 SDLoc dl(Int);
17294 SelectionDAG &DAG = DCI.DAG;
17295 SDValue Elements = Int.getOperand(2);
17296 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
17297 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
17298 && "expected single br user");
17299 SDNode *Br = *N->use_begin();
17300 SDValue OtherTarget = Br->getOperand(1);
17302 // Update the unconditional branch to branch to the given Dest.
17303 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
17304 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
17305 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
17306 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
17309 if (IntOp == Intrinsic::test_start_loop_iterations) {
17310 SDValue Res;
17311 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
17312 // We expect this 'instruction' to branch when the counter is zero.
17313 if (IsTrueIfZero(CC, Imm)) {
17314 SDValue Ops[] = {Chain, Setup, Dest};
17315 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
17316 } else {
17317 // The logic is the reverse of what we need for WLS, so find the other
17318 // basic block target: the target of the proceeding br.
17319 UpdateUncondBr(Br, Dest, DAG);
17321 SDValue Ops[] = {Chain, Setup, OtherTarget};
17322 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
17324 // Update LR count to the new value
17325 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
17326 // Update chain
17327 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
17328 return Res;
17329 } else {
17330 SDValue Size = DAG.getTargetConstant(
17331 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
17332 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
17333 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
17334 DAG.getVTList(MVT::i32, MVT::Other), Args);
17335 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
17337 // We expect this instruction to branch when the count is not zero.
17338 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
17340 // Update the unconditional branch to target the loop preheader if we've
17341 // found the condition has been reversed.
17342 if (Target == OtherTarget)
17343 UpdateUncondBr(Br, Dest, DAG);
17345 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
17346 SDValue(LoopDec.getNode(), 1), Chain);
17348 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
17349 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
17351 return SDValue();
17354 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
17355 SDValue
17356 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
17357 SDValue Cmp = N->getOperand(4);
17358 if (Cmp.getOpcode() != ARMISD::CMPZ)
17359 // Only looking at NE cases.
17360 return SDValue();
17362 EVT VT = N->getValueType(0);
17363 SDLoc dl(N);
17364 SDValue LHS = Cmp.getOperand(0);
17365 SDValue RHS = Cmp.getOperand(1);
17366 SDValue Chain = N->getOperand(0);
17367 SDValue BB = N->getOperand(1);
17368 SDValue ARMcc = N->getOperand(2);
17369 ARMCC::CondCodes CC =
17370 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
17372 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
17373 // -> (brcond Chain BB CC CPSR Cmp)
17374 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
17375 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
17376 LHS->getOperand(0)->hasOneUse()) {
17377 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
17378 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
17379 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
17380 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
17381 if ((LHS00C && LHS00C->getZExtValue() == 0) &&
17382 (LHS01C && LHS01C->getZExtValue() == 1) &&
17383 (LHS1C && LHS1C->getZExtValue() == 1) &&
17384 (RHSC && RHSC->getZExtValue() == 0)) {
17385 return DAG.getNode(
17386 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
17387 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
17391 return SDValue();
17394 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
17395 SDValue
17396 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
17397 SDValue Cmp = N->getOperand(4);
17398 if (Cmp.getOpcode() != ARMISD::CMPZ)
17399 // Only looking at EQ and NE cases.
17400 return SDValue();
17402 EVT VT = N->getValueType(0);
17403 SDLoc dl(N);
17404 SDValue LHS = Cmp.getOperand(0);
17405 SDValue RHS = Cmp.getOperand(1);
17406 SDValue FalseVal = N->getOperand(0);
17407 SDValue TrueVal = N->getOperand(1);
17408 SDValue ARMcc = N->getOperand(2);
17409 ARMCC::CondCodes CC =
17410 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
17412 // BFI is only available on V6T2+.
17413 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
17414 SDValue R = PerformCMOVToBFICombine(N, DAG);
17415 if (R)
17416 return R;
17419 // Simplify
17420 // mov r1, r0
17421 // cmp r1, x
17422 // mov r0, y
17423 // moveq r0, x
17424 // to
17425 // cmp r0, x
17426 // movne r0, y
17428 // mov r1, r0
17429 // cmp r1, x
17430 // mov r0, x
17431 // movne r0, y
17432 // to
17433 // cmp r0, x
17434 // movne r0, y
17435 /// FIXME: Turn this into a target neutral optimization?
17436 SDValue Res;
17437 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
17438 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
17439 N->getOperand(3), Cmp);
17440 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
17441 SDValue ARMcc;
17442 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
17443 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
17444 N->getOperand(3), NewCmp);
17447 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
17448 // -> (cmov F T CC CPSR Cmp)
17449 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
17450 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
17451 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
17452 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
17453 if ((LHS0C && LHS0C->getZExtValue() == 0) &&
17454 (LHS1C && LHS1C->getZExtValue() == 1) &&
17455 (RHSC && RHSC->getZExtValue() == 0)) {
17456 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
17457 LHS->getOperand(2), LHS->getOperand(3),
17458 LHS->getOperand(4));
17462 if (!VT.isInteger())
17463 return SDValue();
17465 // Materialize a boolean comparison for integers so we can avoid branching.
17466 if (isNullConstant(FalseVal)) {
17467 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
17468 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
17469 // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting it
17470 // right by 5 bits turns that 32 into 1, otherwise the result is 0.
17471 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
17472 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
17473 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
17474 DAG.getConstant(5, dl, MVT::i32));
17475 } else {
17476 // CMOV 0, 1, ==, (CMPZ x, y) ->
17477 // (ADDCARRY (SUB x, y), t:0, t:1)
17478 // where t = (SUBCARRY 0, (SUB x, y), 0)
17480 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
17481 // x != y. In other words, a carry C == 1 when x == y, C == 0
17482 // otherwise.
17483 // The final ADDCARRY computes
17484 // x - y + (0 - (x - y)) + C == C
17485 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
17486 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17487 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
17488 // ISD::SUBCARRY returns a borrow, but here we actually want the
17489 // carry.
17490 SDValue Carry =
17491 DAG.getNode(ISD::SUB, dl, MVT::i32,
17492 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
17493 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
17495 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
17496 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
17497 // This seems pointless but will allow us to combine it further below.
17498 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
17499 SDValue Sub =
17500 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
17501 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
17502 Sub.getValue(1), SDValue());
17503 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
17504 N->getOperand(3), CPSRGlue.getValue(1));
17505 FalseVal = Sub;
17507 } else if (isNullConstant(TrueVal)) {
17508 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
17509 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
17510 // This seems pointless but will allow us to combine it further below.
17511 // Note that we change == for != as this is the dual of the case above.
17512 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
17513 SDValue Sub =
17514 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
17515 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
17516 Sub.getValue(1), SDValue());
17517 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
17518 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
17519 N->getOperand(3), CPSRGlue.getValue(1));
17520 FalseVal = Sub;
17524 // On Thumb1, the DAG above may be further combined if z is a power of 2
17525 // (z == 2 ^ K).
17526 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
17527 // t1 = (USUBO (SUB x, y), 1)
17528 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
17529 // Result = if K != 0 then (SHL t2:0, K) else t2:0
17531 // This also handles the special case of comparing against zero; it's
17532 // essentially the same pattern, except there's no SUBS:
17533 // CMOV x, z, !=, (CMPZ x, 0) ->
17534 // t1 = (USUBO x, 1)
17535 // t2 = (SUBCARRY x, t1:0, t1:1)
17536 // Result = if K != 0 then (SHL t2:0, K) else t2:0
17537 const APInt *TrueConst;
17538 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
17539 ((FalseVal.getOpcode() == ARMISD::SUBS &&
17540 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
17541 (FalseVal == LHS && isNullConstant(RHS))) &&
17542 (TrueConst = isPowerOf2Constant(TrueVal))) {
17543 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17544 unsigned ShiftAmount = TrueConst->logBase2();
17545 if (ShiftAmount)
17546 TrueVal = DAG.getConstant(1, dl, VT);
17547 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
17548 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
17550 if (ShiftAmount)
17551 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
17552 DAG.getConstant(ShiftAmount, dl, MVT::i32));
17555 if (Res.getNode()) {
17556 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
17557 // Capture demanded bits information that would otherwise be lost.
17558 if (Known.Zero == 0xfffffffe)
17559 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
17560 DAG.getValueType(MVT::i1));
17561 else if (Known.Zero == 0xffffff00)
17562 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
17563 DAG.getValueType(MVT::i8));
17564 else if (Known.Zero == 0xffff0000)
17565 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
17566 DAG.getValueType(MVT::i16));
17569 return Res;
17572 static SDValue PerformBITCASTCombine(SDNode *N,
17573 TargetLowering::DAGCombinerInfo &DCI,
17574 const ARMSubtarget *ST) {
17575 SelectionDAG &DAG = DCI.DAG;
17576 SDValue Src = N->getOperand(0);
17577 EVT DstVT = N->getValueType(0);
17579 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
17580 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
17581 EVT SrcVT = Src.getValueType();
17582 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
17583 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
17586 // We may have a bitcast of something that has already had this bitcast
17587 // combine performed on it, so skip past any VECTOR_REG_CASTs.
17588 while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
17589 Src = Src.getOperand(0);
17591 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
17592 // would be generated is at least the width of the element type.
17593 EVT SrcVT = Src.getValueType();
17594 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
17595 Src.getOpcode() == ARMISD::VMVNIMM ||
17596 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
17597 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
17598 DAG.getDataLayout().isBigEndian())
17599 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
17601 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
17602 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
17603 return R;
17605 return SDValue();
17608 // Some combines for the MVETrunc truncation legalizer helper. It also lowers
17609 // the node into stack operations after legalizeOps.
17610 SDValue ARMTargetLowering::PerformMVETruncCombine(
17611 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
17612 SelectionDAG &DAG = DCI.DAG;
17613 EVT VT = N->getValueType(0);
17614 SDLoc DL(N);
17616 // MVETrunc(Undef, Undef) -> Undef
17617 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
17618 return DAG.getUNDEF(VT);
17620 // MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
17621 if (N->getNumOperands() == 2 &&
17622 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
17623 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
17624 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
17625 N->getOperand(0).getOperand(1),
17626 N->getOperand(1).getOperand(0),
17627 N->getOperand(1).getOperand(1));
17629 // MVETrunc(shuffle, shuffle) -> VMOVN
17630 if (N->getNumOperands() == 2 &&
17631 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
17632 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
17633 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
17634 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
17636 if (S0->getOperand(0) == S1->getOperand(0) &&
17637 S0->getOperand(1) == S1->getOperand(1)) {
17638 // Construct complete shuffle mask
17639 SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end());
17640 Mask.append(S1->getMask().begin(), S1->getMask().end());
17642 if (isVMOVNTruncMask(Mask, VT, 0))
17643 return DAG.getNode(
17644 ARMISD::VMOVN, DL, VT,
17645 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
17646 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
17647 DAG.getConstant(1, DL, MVT::i32));
17648 if (isVMOVNTruncMask(Mask, VT, 1))
17649 return DAG.getNode(
17650 ARMISD::VMOVN, DL, VT,
17651 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
17652 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
17653 DAG.getConstant(1, DL, MVT::i32));
17657 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
17658 // truncate to a buildvector to allow the generic optimisations to kick in.
17659 if (all_of(N->ops(), [](SDValue Op) {
17660 return Op.getOpcode() == ISD::BUILD_VECTOR ||
17661 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
17662 (Op.getOpcode() == ISD::BITCAST &&
17663 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
17664 })) {
17665 SmallVector<SDValue, 8> Extracts;
17666 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
17667 SDValue O = N->getOperand(Op);
17668 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
17669 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
17670 DAG.getConstant(i, DL, MVT::i32));
17671 Extracts.push_back(Ext);
17674 return DAG.getBuildVector(VT, DL, Extracts);
17677 // If we are late in the legalization process and nothing has optimised
17678 // the trunc to anything better, lower it to a stack store and reload,
17679 // performing the truncation whilst keeping the lanes in the correct order:
17680 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
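// As a rough illustration, an MVETRUNC(v4i32 a, v4i32 b) producing v8i16 is
// lowered to two truncating v4i16 stores into a 16-byte stack slot (at byte
// offsets 0 and 8), followed by a single full-width reload of the slot.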
17681 if (!DCI.isAfterLegalizeDAG())
17682 return SDValue();
17684 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
17685 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
17686 int NumIns = N->getNumOperands();
17687 assert((NumIns == 2 || NumIns == 4) &&
17688 "Expected 2 or 4 inputs to an MVETrunc");
17689 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
17690 if (N->getNumOperands() == 4)
17691 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
17693 SmallVector<SDValue> Chains;
17694 for (int I = 0; I < NumIns; I++) {
17695 SDValue Ptr = DAG.getNode(
17696 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
17697 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
17698 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
17699 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
17700 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
17701 Ptr, MPI, StoreVT, Align(4));
17702 Chains.push_back(Ch);
17705 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17706 MachinePointerInfo MPI =
17707 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
17708 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
17711 // Take an MVEEXT(load x) and split that into (extload x, extload x+8)
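// For example (illustrative): an MVESEXT of a v8i16 load producing two v4i32
// results can instead be formed from two v4i32 sextloads of v4i16, reading
// from the original pointer at byte offsets 0 and 8.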
17712 static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
17713 SelectionDAG &DAG) {
17714 SDValue N0 = N->getOperand(0);
17715 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
17716 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
17717 return SDValue();
17719 EVT FromVT = LD->getMemoryVT();
17720 EVT ToVT = N->getValueType(0);
17721 if (!ToVT.isVector())
17722 return SDValue();
17723 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
17724 EVT ToEltVT = ToVT.getVectorElementType();
17725 EVT FromEltVT = FromVT.getVectorElementType();
17727 unsigned NumElements = 0;
17728 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
17729 NumElements = 4;
17730 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
17731 NumElements = 8;
17732 assert(NumElements != 0);
17734 ISD::LoadExtType NewExtType =
17735 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17736 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
17737 LD->getExtensionType() != ISD::EXTLOAD &&
17738 LD->getExtensionType() != NewExtType)
17739 return SDValue();
17741 LLVMContext &C = *DAG.getContext();
17742 SDLoc DL(LD);
17743 // Details about the old load
17744 SDValue Ch = LD->getChain();
17745 SDValue BasePtr = LD->getBasePtr();
17746 Align Alignment = LD->getOriginalAlign();
17747 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17748 AAMDNodes AAInfo = LD->getAAInfo();
17750 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17751 EVT NewFromVT = EVT::getVectorVT(
17752 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17753 EVT NewToVT = EVT::getVectorVT(
17754 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17756 SmallVector<SDValue, 4> Loads;
17757 SmallVector<SDValue, 4> Chains;
17758 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17759 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17760 SDValue NewPtr =
17761 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
17763 SDValue NewLoad =
17764 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17765 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17766 Alignment, MMOFlags, AAInfo);
17767 Loads.push_back(NewLoad);
17768 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17771 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17772 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17773 return DAG.getMergeValues(Loads, DL);
17776 // Perform combines for MVEEXT. If it has not been optimized to anything better
17777 // before lowering, it gets converted to a stack store and extloads, performing
17778 // the extend whilst still keeping the same lane ordering.
17779 SDValue ARMTargetLowering::PerformMVEExtCombine(
17780 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
17781 SelectionDAG &DAG = DCI.DAG;
17782 EVT VT = N->getValueType(0);
17783 SDLoc DL(N);
17784 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
17785 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
17787 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
17788 *DAG.getContext());
17789 auto Extend = [&](SDValue V) {
17790 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
17791 return N->getOpcode() == ARMISD::MVESEXT
17792 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
17793 DAG.getValueType(ExtVT))
17794 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
17797 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
17798 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
17799 SDValue Ext = Extend(N->getOperand(0));
17800 return DAG.getMergeValues({Ext, Ext}, DL);
17803 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
17804 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
17805 ArrayRef<int> Mask = SVN->getMask();
17806 assert(Mask.size() == 2 * VT.getVectorNumElements());
17807 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
17808 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
17809 SDValue Op0 = SVN->getOperand(0);
17810 SDValue Op1 = SVN->getOperand(1);
17812 auto CheckInregMask = [&](int Start, int Offset) {
17813 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
17814 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
17815 return false;
17816 return true;
17818 SDValue V0 = SDValue(N, 0);
17819 SDValue V1 = SDValue(N, 1);
17820 if (CheckInregMask(0, 0))
17821 V0 = Extend(Op0);
17822 else if (CheckInregMask(0, 1))
17823 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
17824 else if (CheckInregMask(0, Mask.size()))
17825 V0 = Extend(Op1);
17826 else if (CheckInregMask(0, Mask.size() + 1))
17827 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
17829 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
17830 V1 = Extend(Op1);
17831 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
17832 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
17833 else if (CheckInregMask(VT.getVectorNumElements(), 0))
17834 V1 = Extend(Op0);
17835 else if (CheckInregMask(VT.getVectorNumElements(), 1))
17836 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
17838 if (V0.getNode() != N || V1.getNode() != N)
17839 return DAG.getMergeValues({V0, V1}, DL);
17842 // MVEEXT(load) -> extload, extload
17843 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
17844 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
17845 return L;
17847 if (!DCI.isAfterLegalizeDAG())
17848 return SDValue();
17850 // Lower to a stack store and reload:
17851 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
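// As a rough illustration, an MVEZEXT of a v8i16 input producing two v4i32
// results stores the whole input to a 16-byte stack slot, then performs two
// v4i32 zero-extending loads of v4i16 from byte offsets 0 and 8.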
17852 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
17853 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
17854 int NumOuts = N->getNumValues();
17855 assert((NumOuts == 2 || NumOuts == 4) &&
17856 "Expected 2 or 4 outputs to an MVEEXT");
17857 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
17858 *DAG.getContext());
17859 if (N->getNumOperands() == 4)
17860 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
17862 MachinePointerInfo MPI =
17863 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
17864 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
17865 StackPtr, MPI, Align(4));
17867 SmallVector<SDValue> Loads;
17868 for (int I = 0; I < NumOuts; I++) {
17869 SDValue Ptr = DAG.getNode(
17870 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
17871 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
17872 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
17873 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
17874 SDValue Load = DAG.getExtLoad(
17875 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
17876 VT, Chain, Ptr, MPI, LoadVT, Align(4));
17877 Loads.push_back(Load);
17880 return DAG.getMergeValues(Loads, DL);
17883 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
17884 DAGCombinerInfo &DCI) const {
17885 switch (N->getOpcode()) {
17886 default: break;
17887 case ISD::SELECT_CC:
17888 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
17889 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
17890 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
17891 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
17892 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
17893 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
17894 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
17895 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
17896 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
17897 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
17898 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
17899 case ISD::BRCOND:
17900 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
17901 case ARMISD::ADDC:
17902 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
17903 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
17904 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
17905 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
17906 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
17907 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
17908 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
17909 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
17910 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
17911 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
17912 case ISD::EXTRACT_VECTOR_ELT:
17913 return PerformExtractEltCombine(N, DCI, Subtarget);
17914 case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG);
17915 case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI);
17916 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
17917 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
17918 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
17919 case ISD::FP_TO_SINT:
17920 case ISD::FP_TO_UINT:
17921 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
17922 case ISD::FDIV:
17923 return PerformVDIVCombine(N, DCI.DAG, Subtarget);
17924 case ISD::INTRINSIC_WO_CHAIN:
17925 return PerformIntrinsicCombine(N, DCI);
17926 case ISD::SHL:
17927 case ISD::SRA:
17928 case ISD::SRL:
17929 return PerformShiftCombine(N, DCI, Subtarget);
17930 case ISD::SIGN_EXTEND:
17931 case ISD::ZERO_EXTEND:
17932 case ISD::ANY_EXTEND:
17933 return PerformExtendCombine(N, DCI.DAG, Subtarget);
17934 case ISD::FP_EXTEND:
17935 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
17936 case ISD::SMIN:
17937 case ISD::UMIN:
17938 case ISD::SMAX:
17939 case ISD::UMAX:
17940 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
17941 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
17942 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
17943 case ISD::LOAD: return PerformLOADCombine(N, DCI);
17944 case ARMISD::VLD1DUP:
17945 case ARMISD::VLD2DUP:
17946 case ARMISD::VLD3DUP:
17947 case ARMISD::VLD4DUP:
17948 return PerformVLDCombine(N, DCI);
17949 case ARMISD::BUILD_VECTOR:
17950 return PerformARMBUILD_VECTORCombine(N, DCI);
17951 case ISD::BITCAST:
17952 return PerformBITCASTCombine(N, DCI, Subtarget);
17953 case ARMISD::PREDICATE_CAST:
17954 return PerformPREDICATE_CASTCombine(N, DCI);
17955 case ARMISD::VECTOR_REG_CAST:
17956 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
17957 case ARMISD::MVETRUNC:
17958 return PerformMVETruncCombine(N, DCI);
17959 case ARMISD::MVESEXT:
17960 case ARMISD::MVEZEXT:
17961 return PerformMVEExtCombine(N, DCI);
17962 case ARMISD::VCMP:
17963 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
17964 case ISD::VECREDUCE_ADD:
17965 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
17966 case ARMISD::VMOVN:
17967 return PerformVMOVNCombine(N, DCI);
17968 case ARMISD::VQMOVNs:
17969 case ARMISD::VQMOVNu:
17970 return PerformVQMOVNCombine(N, DCI);
17971 case ARMISD::ASRL:
17972 case ARMISD::LSRL:
17973 case ARMISD::LSLL:
17974 return PerformLongShiftCombine(N, DCI.DAG);
17975 case ARMISD::SMULWB: {
17976 unsigned BitWidth = N->getValueType(0).getSizeInBits();
17977 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
17978 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17979 return SDValue();
17980 break;
17982 case ARMISD::SMULWT: {
17983 unsigned BitWidth = N->getValueType(0).getSizeInBits();
17984 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
17985 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17986 return SDValue();
17987 break;
17989 case ARMISD::SMLALBB:
17990 case ARMISD::QADD16b:
17991 case ARMISD::QSUB16b:
17992 case ARMISD::UQADD16b:
17993 case ARMISD::UQSUB16b: {
17994 unsigned BitWidth = N->getValueType(0).getSizeInBits();
17995 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
17996 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
17997 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
17998 return SDValue();
17999 break;
18001 case ARMISD::SMLALBT: {
18002 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
18003 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18004 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
18005 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18006 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
18007 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
18008 return SDValue();
18009 break;
18011 case ARMISD::SMLALTB: {
18012 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
18013 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18014 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
18015 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18016 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
18017 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
18018 return SDValue();
18019 break;
18021 case ARMISD::SMLALTT: {
18022 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18023 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18024 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18025 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18026 return SDValue();
18027 break;
18029 case ARMISD::QADD8b:
18030 case ARMISD::QSUB8b:
18031 case ARMISD::UQADD8b:
18032 case ARMISD::UQSUB8b: {
18033 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18034 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
18035 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18036 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18037 return SDValue();
18038 break;
18040 case ISD::INTRINSIC_VOID:
18041 case ISD::INTRINSIC_W_CHAIN:
18042 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
18043 case Intrinsic::arm_neon_vld1:
18044 case Intrinsic::arm_neon_vld1x2:
18045 case Intrinsic::arm_neon_vld1x3:
18046 case Intrinsic::arm_neon_vld1x4:
18047 case Intrinsic::arm_neon_vld2:
18048 case Intrinsic::arm_neon_vld3:
18049 case Intrinsic::arm_neon_vld4:
18050 case Intrinsic::arm_neon_vld2lane:
18051 case Intrinsic::arm_neon_vld3lane:
18052 case Intrinsic::arm_neon_vld4lane:
18053 case Intrinsic::arm_neon_vld2dup:
18054 case Intrinsic::arm_neon_vld3dup:
18055 case Intrinsic::arm_neon_vld4dup:
18056 case Intrinsic::arm_neon_vst1:
18057 case Intrinsic::arm_neon_vst1x2:
18058 case Intrinsic::arm_neon_vst1x3:
18059 case Intrinsic::arm_neon_vst1x4:
18060 case Intrinsic::arm_neon_vst2:
18061 case Intrinsic::arm_neon_vst3:
18062 case Intrinsic::arm_neon_vst4:
18063 case Intrinsic::arm_neon_vst2lane:
18064 case Intrinsic::arm_neon_vst3lane:
18065 case Intrinsic::arm_neon_vst4lane:
18066 return PerformVLDCombine(N, DCI);
18067 case Intrinsic::arm_mve_vld2q:
18068 case Intrinsic::arm_mve_vld4q:
18069 case Intrinsic::arm_mve_vst2q:
18070 case Intrinsic::arm_mve_vst4q:
18071 return PerformMVEVLDCombine(N, DCI);
18072 default: break;
18074 break;
18076 return SDValue();
18079 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
18080 EVT VT) const {
18081 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
18084 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
18085 Align Alignment,
18086 MachineMemOperand::Flags,
18087 bool *Fast) const {
18088 // Depends on what it gets converted into if the type is weird.
18089 if (!VT.isSimple())
18090 return false;
18092 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
18093 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
18094 auto Ty = VT.getSimpleVT().SimpleTy;
18096 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
18097 // Unaligned access can use (for example) LDRB, LDRH, LDR
18098 if (AllowsUnaligned) {
18099 if (Fast)
18100 *Fast = Subtarget->hasV7Ops();
18101 return true;
18105 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
18106 // For any little-endian targets with neon, we can support unaligned ld/st
18107 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
18108 // A big-endian target may also explicitly support unaligned accesses
18109 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
18110 if (Fast)
18111 *Fast = true;
18112 return true;
18116 if (!Subtarget->hasMVEIntegerOps())
18117 return false;
18119 // These are for predicates
18120 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
18121 if (Fast)
18122 *Fast = true;
18123 return true;
18126 // These are for truncated stores/narrowing loads. They are fine so long as
18127 // the alignment is at least the size of the item being loaded
18128 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
18129 Alignment >= VT.getScalarSizeInBits() / 8) {
18130 if (Fast)
18131 *Fast = true;
18132 return true;
18135 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
18136 // VSTRW.U32 all store the vector register in exactly the same format, and
18137 // differ only in the range of their immediate offset field and the required
18138 // alignment. So there is always a store that can be used, regardless of
18139 // actual type.
18141 // For big endian, that is not the case, but we can still emit a (VSTRB.U8;
18142 // VREV64.8) pair and get the same effect. This will likely be better than
18143 // aligning the vector through the stack.
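// For example (illustrative): an unaligned v4i32 store can simply be emitted
// as a VSTRB.U8 of the same register in little-endian mode, so it does not
// need to be split or realigned.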
18144 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
18145 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
18146 Ty == MVT::v2f64) {
18147 if (Fast)
18148 *Fast = true;
18149 return true;
18152 return false;
18156 EVT ARMTargetLowering::getOptimalMemOpType(
18157 const MemOp &Op, const AttributeList &FuncAttributes) const {
18158 // See if we can use NEON instructions for this...
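// For example (illustrative): a 16-byte memcpy that is 16-byte aligned (or on
// a core where unaligned NEON accesses are fast) is then expanded using
// 128-bit vector loads and stores rather than a sequence of word operations.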
18159 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
18160 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
18161 bool Fast;
18162 if (Op.size() >= 16 &&
18163 (Op.isAligned(Align(16)) ||
18164 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
18165 MachineMemOperand::MONone, &Fast) &&
18166 Fast))) {
18167 return MVT::v2f64;
18168 } else if (Op.size() >= 8 &&
18169 (Op.isAligned(Align(8)) ||
18170 (allowsMisalignedMemoryAccesses(
18171 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
18172 Fast))) {
18173 return MVT::f64;
18177 // Let the target-independent logic figure it out.
18178 return MVT::Other;
18181 // 64-bit integers are split into their high and low parts and held in two
18182 // different registers, so the trunc is free since the low register can just
18183 // be used.
18184 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
18185 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
18186 return false;
18187 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
18188 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
18189 return (SrcBits == 64 && DestBits == 32);
18192 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
18193 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
18194 !DstVT.isInteger())
18195 return false;
18196 unsigned SrcBits = SrcVT.getSizeInBits();
18197 unsigned DestBits = DstVT.getSizeInBits();
18198 return (SrcBits == 64 && DestBits == 32);
18201 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
18202 if (Val.getOpcode() != ISD::LOAD)
18203 return false;
18205 EVT VT1 = Val.getValueType();
18206 if (!VT1.isSimple() || !VT1.isInteger() ||
18207 !VT2.isSimple() || !VT2.isInteger())
18208 return false;
18210 switch (VT1.getSimpleVT().SimpleTy) {
18211 default: break;
18212 case MVT::i1:
18213 case MVT::i8:
18214 case MVT::i16:
18215 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
18216 return true;
18219 return false;
18222 bool ARMTargetLowering::isFNegFree(EVT VT) const {
18223 if (!VT.isSimple())
18224 return false;
18226 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
18227 // negate values directly (fneg is free). So, we don't want to let the DAG
18228 // combiner rewrite fneg into xors and some other instructions. For f16 and
18229 // FullFP16 argument passing, some bitcast nodes may be introduced,
18230 // triggering this DAG combine rewrite, so we are avoiding that with this.
18231 switch (VT.getSimpleVT().SimpleTy) {
18232 default: break;
18233 case MVT::f16:
18234 return Subtarget->hasFullFP16();
18237 return false;
18240 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
18241 /// of the vector elements.
18242 static bool areExtractExts(Value *Ext1, Value *Ext2) {
18243 auto areExtDoubled = [](Instruction *Ext) {
18244 return Ext->getType()->getScalarSizeInBits() ==
18245 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
18248 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
18249 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
18250 !areExtDoubled(cast<Instruction>(Ext1)) ||
18251 !areExtDoubled(cast<Instruction>(Ext2)))
18252 return false;
18254 return true;
18257 /// Check if sinking \p I's operands to I's basic block is profitable, because
18258 /// the operands can be folded into a target instruction, e.g.
18259 /// sext/zext can be folded into vsubl.
18260 bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
18261 SmallVectorImpl<Use *> &Ops) const {
18262 if (!I->getType()->isVectorTy())
18263 return false;
18265 if (Subtarget->hasNEON()) {
18266 switch (I->getOpcode()) {
18267 case Instruction::Sub:
18268 case Instruction::Add: {
18269 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
18270 return false;
18271 Ops.push_back(&I->getOperandUse(0));
18272 Ops.push_back(&I->getOperandUse(1));
18273 return true;
18275 default:
18276 return false;
18280 if (!Subtarget->hasMVEIntegerOps())
18281 return false;
18283 auto IsFMSMul = [&](Instruction *I) {
18284 if (!I->hasOneUse())
18285 return false;
18286 auto *Sub = cast<Instruction>(*I->users().begin());
18287 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
18289 auto IsFMS = [&](Instruction *I) {
18290 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
18291 match(I->getOperand(1), m_FNeg(m_Value())))
18292 return true;
18293 return false;
18296 auto IsSinker = [&](Instruction *I, int Operand) {
18297 switch (I->getOpcode()) {
18298 case Instruction::Add:
18299 case Instruction::Mul:
18300 case Instruction::FAdd:
18301 case Instruction::ICmp:
18302 case Instruction::FCmp:
18303 return true;
18304 case Instruction::FMul:
18305 return !IsFMSMul(I);
18306 case Instruction::Sub:
18307 case Instruction::FSub:
18308 case Instruction::Shl:
18309 case Instruction::LShr:
18310 case Instruction::AShr:
18311 return Operand == 1;
18312 case Instruction::Call:
18313 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
18314 switch (II->getIntrinsicID()) {
18315 case Intrinsic::fma:
18316 return !IsFMS(I);
18317 case Intrinsic::arm_mve_add_predicated:
18318 case Intrinsic::arm_mve_mul_predicated:
18319 case Intrinsic::arm_mve_qadd_predicated:
18320 case Intrinsic::arm_mve_hadd_predicated:
18321 case Intrinsic::arm_mve_vqdmull_predicated:
18322 case Intrinsic::arm_mve_qdmulh_predicated:
18323 case Intrinsic::arm_mve_qrdmulh_predicated:
18324 case Intrinsic::arm_mve_fma_predicated:
18325 return true;
18326 case Intrinsic::arm_mve_sub_predicated:
18327 case Intrinsic::arm_mve_qsub_predicated:
18328 case Intrinsic::arm_mve_hsub_predicated:
18329 return Operand == 1;
18330 default:
18331 return false;
18334 return false;
18335 default:
18336 return false;
18340 for (auto OpIdx : enumerate(I->operands())) {
18341 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
18342 // Make sure we are not already sinking this operand
18343 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
18344 continue;
18346 Instruction *Shuffle = Op;
18347 if (Shuffle->getOpcode() == Instruction::BitCast)
18348 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
18349 // We are looking for a splat that can be sunk.
18350 if (!Shuffle ||
18351 !match(Shuffle, m_Shuffle(
18352 m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
18353 m_Undef(), m_ZeroMask())))
18354 continue;
18355 if (!IsSinker(I, OpIdx.index()))
18356 continue;
18358 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
18359 // and vector registers
18360 for (Use &U : Op->uses()) {
18361 Instruction *Insn = cast<Instruction>(U.getUser());
18362 if (!IsSinker(Insn, U.getOperandNo()))
18363 return false;
18366 Ops.push_back(&Shuffle->getOperandUse(0));
18367 if (Shuffle != Op)
18368 Ops.push_back(&Op->getOperandUse(0));
18369 Ops.push_back(&OpIdx.value());
18371 return true;
18374 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
18375 if (!Subtarget->hasMVEIntegerOps())
18376 return nullptr;
18377 Type *SVIType = SVI->getType();
18378 Type *ScalarType = SVIType->getScalarType();
18380 if (ScalarType->isFloatTy())
18381 return Type::getInt32Ty(SVIType->getContext());
18382 if (ScalarType->isHalfTy())
18383 return Type::getInt16Ty(SVIType->getContext());
18384 return nullptr;
18387 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
18388 EVT VT = ExtVal.getValueType();
18390 if (!isTypeLegal(VT))
18391 return false;
18393 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
18394 if (Ld->isExpandingLoad())
18395 return false;
18398 if (Subtarget->hasMVEIntegerOps())
18399 return true;
18401 // Don't create a loadext if we can fold the extension into a wide/long
18402 // instruction.
18403 // If there's more than one user instruction, the loadext is desirable no
18404 // matter what. There can be two uses by the same instruction.
18405 if (ExtVal->use_empty() ||
18406 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
18407 return true;
18409 SDNode *U = *ExtVal->use_begin();
18410 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
18411 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
18412 return false;
18414 return true;
18417 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
18418 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
18419 return false;
18421 if (!isTypeLegal(EVT::getEVT(Ty1)))
18422 return false;
18424 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
18426 // Assuming the caller doesn't have a zeroext or signext return parameter,
18427 // truncation all the way down to i1 is valid.
18428 return true;
18431 InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
18432 const AddrMode &AM,
18433 Type *Ty,
18434 unsigned AS) const {
18435 if (isLegalAddressingMode(DL, AM, Ty, AS)) {
18436 if (Subtarget->hasFPAO())
18437 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
18438 return 0;
18440 return -1;
18443 /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
18444 /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
18445 /// expanded to FMAs when this method returns true, otherwise fmuladd is
18446 /// expanded to fmul + fadd.
18448 /// ARM supports both fused and unfused multiply-add operations; we already
18449 /// lower a pair of fmul and fadd to the latter so it's not clear that there
18450 /// would be a gain or that the gain would be worthwhile enough to risk
18451 /// correctness bugs.
18453 /// For MVE, we set this to true as it helps simplify the patterns needed
18454 /// (and we don't have non-fused floating point instructions).
18455 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
18456 EVT VT) const {
18457 if (!VT.isSimple())
18458 return false;
18460 switch (VT.getSimpleVT().SimpleTy) {
18461 case MVT::v4f32:
18462 case MVT::v8f16:
18463 return Subtarget->hasMVEFloatOps();
18464 case MVT::f16:
18465 return Subtarget->useFPVFMx16();
18466 case MVT::f32:
18467 return Subtarget->useFPVFMx();
18468 case MVT::f64:
18469 return Subtarget->useFPVFMx64();
18470 default:
18471 break;
18474 return false;
18477 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
18478 if (V < 0)
18479 return false;
18481 unsigned Scale = 1;
18482 switch (VT.getSimpleVT().SimpleTy) {
18483 case MVT::i1:
18484 case MVT::i8:
18485 // Scale == 1;
18486 break;
18487 case MVT::i16:
18488 // Scale == 2;
18489 Scale = 2;
18490 break;
18491 default:
18492 // On Thumb1 we load most things (i32, i64, floats, etc.) with an LDR
18493 // Scale == 4;
18494 Scale = 4;
18495 break;
18498 if ((V & (Scale - 1)) != 0)
18499 return false;
18500 return isUInt<5>(V / Scale);
18503 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
18504 const ARMSubtarget *Subtarget) {
18505 if (!VT.isInteger() && !VT.isFloatingPoint())
18506 return false;
18507 if (VT.isVector() && Subtarget->hasNEON())
18508 return false;
18509 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
18510 !Subtarget->hasMVEFloatOps())
18511 return false;
18513 bool IsNeg = false;
18514 if (V < 0) {
18515 IsNeg = true;
18516 V = -V;
18519 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
18521 // MVE: size * imm7
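// For example (illustrative): a 32-bit element access can encode immediate
// offsets that are multiples of 4 up to +/-508 (a 7-bit immediate scaled by 4).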
18522 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
18523 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
18524 case MVT::i32:
18525 case MVT::f32:
18526 return isShiftedUInt<7,2>(V);
18527 case MVT::i16:
18528 case MVT::f16:
18529 return isShiftedUInt<7,1>(V);
18530 case MVT::i8:
18531 return isUInt<7>(V);
18532 default:
18533 return false;
18537 // half VLDR: 2 * imm8
18538 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
18539 return isShiftedUInt<8, 1>(V);
18540 // VLDR and LDRD: 4 * imm8
18541 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
18542 return isShiftedUInt<8, 2>(V);
18544 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
18545 // + imm12 or - imm8
18546 if (IsNeg)
18547 return isUInt<8>(V);
18548 return isUInt<12>(V);
18551 return false;
18554 /// isLegalAddressImmediate - Return true if the integer value can be used
18555 /// as the offset of the target addressing mode for load / store of the
18556 /// given type.
18557 static bool isLegalAddressImmediate(int64_t V, EVT VT,
18558 const ARMSubtarget *Subtarget) {
18559 if (V == 0)
18560 return true;
18562 if (!VT.isSimple())
18563 return false;
18565 if (Subtarget->isThumb1Only())
18566 return isLegalT1AddressImmediate(V, VT);
18567 else if (Subtarget->isThumb2())
18568 return isLegalT2AddressImmediate(V, VT, Subtarget);
18570 // ARM mode.
18571 if (V < 0)
18572 V = - V;
18573 switch (VT.getSimpleVT().SimpleTy) {
18574 default: return false;
18575 case MVT::i1:
18576 case MVT::i8:
18577 case MVT::i32:
18578 // +- imm12
18579 return isUInt<12>(V);
18580 case MVT::i16:
18581 // +- imm8
18582 return isUInt<8>(V);
18583 case MVT::f32:
18584 case MVT::f64:
18585 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
18586 return false;
18587 return isShiftedUInt<8, 2>(V);
18591 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
18592 EVT VT) const {
18593 int Scale = AM.Scale;
18594 if (Scale < 0)
18595 return false;
18597 switch (VT.getSimpleVT().SimpleTy) {
18598 default: return false;
18599 case MVT::i1:
18600 case MVT::i8:
18601 case MVT::i16:
18602 case MVT::i32:
18603 if (Scale == 1)
18604 return true;
18605 // r + r << imm
18606 Scale = Scale & ~1;
18607 return Scale == 2 || Scale == 4 || Scale == 8;
18608 case MVT::i64:
18609 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
18610 // version in Thumb mode.
18611 // r + r
18612 if (Scale == 1)
18613 return true;
18614 // r * 2 (this can be lowered to r + r).
18615 if (!AM.HasBaseReg && Scale == 2)
18616 return true;
18617 return false;
18618 case MVT::isVoid:
18619 // Note, we allow "void" uses (basically, uses that aren't loads or
18620 // stores), because ARM allows folding a scale into many arithmetic
18621 // operations. This should be made more precise and revisited later.
18623 // Allow r << imm, but the imm has to be a multiple of two.
18624 if (Scale & 1) return false;
18625 return isPowerOf2_32(Scale);
18629 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
18630 EVT VT) const {
18631 const int Scale = AM.Scale;
18633 // Negative scales are not supported in Thumb1.
18634 if (Scale < 0)
18635 return false;
18637 // Thumb1 addressing modes do not support register scaling excepting the
18638 // following cases:
18639 // 1. Scale == 1 means no scaling.
18640 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
18641 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
18644 /// isLegalAddressingMode - Return true if the addressing mode represented
18645 /// by AM is legal for this target, for a load/store of the specified type.
18646 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
18647 const AddrMode &AM, Type *Ty,
18648 unsigned AS, Instruction *I) const {
18649 EVT VT = getValueType(DL, Ty, true);
18650 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
18651 return false;
18653 // Can never fold addr of global into load/store.
18654 if (AM.BaseGV)
18655 return false;
18657 switch (AM.Scale) {
18658 case 0: // no scale reg, must be "r+i" or "r", or "i".
18659 break;
18660 default:
18661 // ARM doesn't support any R+R*scale+imm addr modes.
18662 if (AM.BaseOffs)
18663 return false;
18665 if (!VT.isSimple())
18666 return false;
18668 if (Subtarget->isThumb1Only())
18669 return isLegalT1ScaledAddressingMode(AM, VT);
18671 if (Subtarget->isThumb2())
18672 return isLegalT2ScaledAddressingMode(AM, VT);
18674 int Scale = AM.Scale;
18675 switch (VT.getSimpleVT().SimpleTy) {
18676 default: return false;
18677 case MVT::i1:
18678 case MVT::i8:
18679 case MVT::i32:
18680 if (Scale < 0) Scale = -Scale;
18681 if (Scale == 1)
18682 return true;
18683 // r + r << imm
18684 return isPowerOf2_32(Scale & ~1);
18685 case MVT::i16:
18686 case MVT::i64:
18687 // r +/- r
18688 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
18689 return true;
18690 // r * 2 (this can be lowered to r + r).
18691 if (!AM.HasBaseReg && Scale == 2)
18692 return true;
18693 return false;
18695 case MVT::isVoid:
18696 // Note, we allow "void" uses (basically, uses that aren't loads or
18697 // stores), because ARM allows folding a scale into many arithmetic
18698 // operations. This should be made more precise and revisited later.
18700 // Allow r << imm, but the imm has to be a multiple of two.
18701 if (Scale & 1) return false;
18702 return isPowerOf2_32(Scale);
18705 return true;
18708 /// isLegalICmpImmediate - Return true if the specified immediate is legal
18709 /// icmp immediate, that is the target has icmp instructions which can compare
18710 /// a register against the immediate without having to materialize the
18711 /// immediate into a register.
18712 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
18713 // Thumb2 and ARM modes can use cmn for negative immediates.
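// For example (illustrative): cmp r0, #-10 has no encoding, but cmn r0, #10
// sets the flags for the same comparison.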
18714 if (!Subtarget->isThumb())
18715 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
18716 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
18717 if (Subtarget->isThumb2())
18718 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
18719 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
18720 // Thumb1 doesn't have cmn, and only has 8-bit immediates.
18721 return Imm >= 0 && Imm <= 255;
18724 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
18725 /// *or sub* immediate, that is the target has add or sub instructions which can
18726 /// add a register with the immediate without having to materialize the
18727 /// immediate into a register.
18728 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
18729 // Same encoding for add/sub, just flip the sign.
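// For example (illustrative): add r0, r1, #-8 can be materialized as
// sub r0, r1, #8, so only the absolute value needs to be encodable.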
18730 int64_t AbsImm = std::abs(Imm);
18731 if (!Subtarget->isThumb())
18732 return ARM_AM::getSOImmVal(AbsImm) != -1;
18733 if (Subtarget->isThumb2())
18734 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
18735 // Thumb1 only has 8-bit unsigned immediate.
18736 return AbsImm >= 0 && AbsImm <= 255;
18739 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
18740 bool isSEXTLoad, SDValue &Base,
18741 SDValue &Offset, bool &isInc,
18742 SelectionDAG &DAG) {
18743 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
18744 return false;
18746 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
18747 // AddressingMode 3
18748 Base = Ptr->getOperand(0);
18749 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
18750 int RHSC = (int)RHS->getZExtValue();
18751 if (RHSC < 0 && RHSC > -256) {
18752 assert(Ptr->getOpcode() == ISD::ADD);
18753 isInc = false;
18754 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18755 return true;
18758 isInc = (Ptr->getOpcode() == ISD::ADD);
18759 Offset = Ptr->getOperand(1);
18760 return true;
18761 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
18762 // AddressingMode 2
18763 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
18764 int RHSC = (int)RHS->getZExtValue();
18765 if (RHSC < 0 && RHSC > -0x1000) {
18766 assert(Ptr->getOpcode() == ISD::ADD);
18767 isInc = false;
18768 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18769 Base = Ptr->getOperand(0);
18770 return true;
18774 if (Ptr->getOpcode() == ISD::ADD) {
18775 isInc = true;
18776 ARM_AM::ShiftOpc ShOpcVal=
18777 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
18778 if (ShOpcVal != ARM_AM::no_shift) {
18779 Base = Ptr->getOperand(1);
18780 Offset = Ptr->getOperand(0);
18781 } else {
18782 Base = Ptr->getOperand(0);
18783 Offset = Ptr->getOperand(1);
18785 return true;
18788 isInc = (Ptr->getOpcode() == ISD::ADD);
18789 Base = Ptr->getOperand(0);
18790 Offset = Ptr->getOperand(1);
18791 return true;
18794 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
18795 return false;
18798 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
18799 bool isSEXTLoad, SDValue &Base,
18800 SDValue &Offset, bool &isInc,
18801 SelectionDAG &DAG) {
18802 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
18803 return false;
18805 Base = Ptr->getOperand(0);
18806 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
18807 int RHSC = (int)RHS->getZExtValue();
18808 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
18809 assert(Ptr->getOpcode() == ISD::ADD);
18810 isInc = false;
18811 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18812 return true;
18813 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
18814 isInc = Ptr->getOpcode() == ISD::ADD;
18815 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
18816 return true;
18820 return false;
18823 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
18824 bool isSEXTLoad, bool IsMasked, bool isLE,
18825 SDValue &Base, SDValue &Offset,
18826 bool &isInc, SelectionDAG &DAG) {
18827 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
18828 return false;
18829 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
18830 return false;
18832 // We allow LE non-masked loads to change the type (for example use a vldrb.8
18833 // as opposed to a vldrw.32). This can allow extra addressing modes or
18834 // alignments for what is otherwise an equivalent instruction.
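// For example (illustrative): a little-endian, non-masked v4i32 access with
// alignment 1 can still be selected as an indexed vldrb.8/vstrb.8, even
// though the vldrw.32 form would require 4-byte alignment.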
18835 bool CanChangeType = isLE && !IsMasked;
18837 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
18838 int RHSC = (int)RHS->getZExtValue();
18840 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
18841 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
18842 assert(Ptr->getOpcode() == ISD::ADD);
18843 isInc = false;
18844 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18845 return true;
18846 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
18847 isInc = Ptr->getOpcode() == ISD::ADD;
18848 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
18849 return true;
18851 return false;
18854 // Try to find a matching instruction based on s/zext, Alignment, Offset and
18855 // (in BE/masked) type.
18856 Base = Ptr->getOperand(0);
18857 if (VT == MVT::v4i16) {
18858 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
18859 return true;
18860 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
18861 if (IsInRange(RHSC, 0x80, 1))
18862 return true;
18863 } else if (Alignment >= 4 &&
18864 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
18865 IsInRange(RHSC, 0x80, 4))
18866 return true;
18867 else if (Alignment >= 2 &&
18868 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
18869 IsInRange(RHSC, 0x80, 2))
18870 return true;
18871 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
18872 return true;
18873 return false;
18876 /// getPreIndexedAddressParts - returns true by value, base pointer and
18877 /// offset pointer and addressing mode by reference if the node's address
18878 /// can be legally represented as pre-indexed load / store address.
18879 bool
18880 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
18881 SDValue &Offset,
18882 ISD::MemIndexedMode &AM,
18883 SelectionDAG &DAG) const {
18884 if (Subtarget->isThumb1Only())
18885 return false;
18887 EVT VT;
18888 SDValue Ptr;
18889 Align Alignment;
18890 bool isSEXTLoad = false;
18891 bool IsMasked = false;
18892 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
18893 Ptr = LD->getBasePtr();
18894 VT = LD->getMemoryVT();
18895 Alignment = LD->getAlign();
18896 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18897 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
18898 Ptr = ST->getBasePtr();
18899 VT = ST->getMemoryVT();
18900 Alignment = ST->getAlign();
18901 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
18902 Ptr = LD->getBasePtr();
18903 VT = LD->getMemoryVT();
18904 Alignment = LD->getAlign();
18905 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18906 IsMasked = true;
18907 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
18908 Ptr = ST->getBasePtr();
18909 VT = ST->getMemoryVT();
18910 Alignment = ST->getAlign();
18911 IsMasked = true;
18912 } else
18913 return false;
18915 bool isInc;
18916 bool isLegal = false;
18917 if (VT.isVector())
18918 isLegal = Subtarget->hasMVEIntegerOps() &&
18919 getMVEIndexedAddressParts(
18920 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
18921 Subtarget->isLittle(), Base, Offset, isInc, DAG);
18922 else {
18923 if (Subtarget->isThumb2())
18924 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
18925 Offset, isInc, DAG);
18926 else
18927 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
18928 Offset, isInc, DAG);
18930 if (!isLegal)
18931 return false;
18933 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
18934 return true;
18937 /// getPostIndexedAddressParts - returns true by value, base pointer and
18938 /// offset pointer and addressing mode by reference if this node can be
18939 /// combined with a load / store to form a post-indexed load / store.
18940 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
18941 SDValue &Base,
18942 SDValue &Offset,
18943 ISD::MemIndexedMode &AM,
18944 SelectionDAG &DAG) const {
18945 EVT VT;
18946 SDValue Ptr;
18947 Align Alignment;
18948 bool isSEXTLoad = false, isNonExt;
18949 bool IsMasked = false;
18950 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
18951 VT = LD->getMemoryVT();
18952 Ptr = LD->getBasePtr();
18953 Alignment = LD->getAlign();
18954 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18955 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
18956 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
18957 VT = ST->getMemoryVT();
18958 Ptr = ST->getBasePtr();
18959 Alignment = ST->getAlign();
18960 isNonExt = !ST->isTruncatingStore();
18961 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
18962 VT = LD->getMemoryVT();
18963 Ptr = LD->getBasePtr();
18964 Alignment = LD->getAlign();
18965 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18966 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
18967 IsMasked = true;
18968 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
18969 VT = ST->getMemoryVT();
18970 Ptr = ST->getBasePtr();
18971 Alignment = ST->getAlign();
18972 isNonExt = !ST->isTruncatingStore();
18973 IsMasked = true;
18974 } else
18975 return false;
18977 if (Subtarget->isThumb1Only()) {
18978 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
18979 // must be non-extending/truncating, i32, with an offset of 4.
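// For example (illustrative): a non-extending i32 load of the form
// ldr r0, [r1], #4 can be selected as an LDM with base-register writeback.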
18980 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
18981 if (Op->getOpcode() != ISD::ADD || !isNonExt)
18982 return false;
18983 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
18984 if (!RHS || RHS->getZExtValue() != 4)
18985 return false;
18986 if (Alignment < Align(4))
18987 return false;
18989 Offset = Op->getOperand(1);
18990 Base = Op->getOperand(0);
18991 AM = ISD::POST_INC;
18992 return true;
18995 bool isInc;
18996 bool isLegal = false;
18997 if (VT.isVector())
18998 isLegal = Subtarget->hasMVEIntegerOps() &&
18999 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19000 Subtarget->isLittle(), Base, Offset,
19001 isInc, DAG);
19002 else {
19003 if (Subtarget->isThumb2())
19004 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19005 isInc, DAG);
19006 else
19007 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19008 isInc, DAG);
19010 if (!isLegal)
19011 return false;
19013 if (Ptr != Base) {
19014 // Swap base ptr and offset to catch more post-index load / store when
19015 // it's legal. In Thumb2 mode, offset must be an immediate.
19016 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19017 !Subtarget->isThumb2())
19018 std::swap(Base, Offset);
19020 // Post-indexed load / store update the base pointer.
19021 if (Ptr != Base)
19022 return false;
19025 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19026 return true;
19029 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19030 KnownBits &Known,
19031 const APInt &DemandedElts,
19032 const SelectionDAG &DAG,
19033 unsigned Depth) const {
19034 unsigned BitWidth = Known.getBitWidth();
19035 Known.resetAll();
19036 switch (Op.getOpcode()) {
19037 default: break;
19038 case ARMISD::ADDC:
19039 case ARMISD::ADDE:
19040 case ARMISD::SUBC:
19041 case ARMISD::SUBE:
19042 // Special cases when we convert a carry to a boolean.
19043 if (Op.getResNo() == 0) {
19044 SDValue LHS = Op.getOperand(0);
19045 SDValue RHS = Op.getOperand(1);
19046 // (ADDE 0, 0, C) will give us a single bit.
19047 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19048 isNullConstant(RHS)) {
19049 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19050 return;
19053 break;
19054 case ARMISD::CMOV: {
19055 // Bits are known zero/one if known on the LHS and RHS.
19056 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19057 if (Known.isUnknown())
19058 return;
19060 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19061 Known = KnownBits::commonBits(Known, KnownRHS);
19062 return;
19064 case ISD::INTRINSIC_W_CHAIN: {
19065 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
19066 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
19067 switch (IntID) {
19068 default: return;
19069 case Intrinsic::arm_ldaex:
19070 case Intrinsic::arm_ldrex: {
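// An exclusive load only defines as many bits as it reads from memory; for
// example (illustrative), ldrexb leaves bits 8-31 of the i32 result zero.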
19071 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19072 unsigned MemBits = VT.getScalarSizeInBits();
19073 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19074 return;
19078 case ARMISD::BFI: {
19079 // Conservatively, we can recurse down the first operand
19080 // and just mask out all affected bits.
19081 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19083 // The operand to BFI is already a mask suitable for removing the bits it
19084 // sets.
19085 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
19086 const APInt &Mask = CI->getAPIntValue();
19087 Known.Zero &= Mask;
19088 Known.One &= Mask;
19089 return;
19091 case ARMISD::VGETLANEs:
19092 case ARMISD::VGETLANEu: {
19093 const SDValue &SrcSV = Op.getOperand(0);
19094 EVT VecVT = SrcSV.getValueType();
19095 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19096 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19097 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19098 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19099 "VGETLANE index out of bounds");
19100 unsigned Idx = Pos->getZExtValue();
19101 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
19102 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
19104 EVT VT = Op.getValueType();
19105 const unsigned DstSz = VT.getScalarSizeInBits();
19106 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
19107 (void)SrcSz;
19108 assert(SrcSz == Known.getBitWidth());
19109 assert(DstSz > SrcSz);
19110 if (Op.getOpcode() == ARMISD::VGETLANEs)
19111 Known = Known.sext(DstSz);
19112 else {
19113 Known = Known.zext(DstSz);
19115 assert(DstSz == Known.getBitWidth());
19116 break;
19118 case ARMISD::VMOVrh: {
19119 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19120 assert(KnownOp.getBitWidth() == 16);
19121 Known = KnownOp.zext(32);
19122 break;
19124 case ARMISD::CSINC:
19125 case ARMISD::CSINV:
19126 case ARMISD::CSNEG: {
19127 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19128 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
19130 // The result is either:
19131 // CSINC: KnownOp0 or KnownOp1 + 1
19132 // CSINV: KnownOp0 or ~KnownOp1
19133 // CSNEG: KnownOp0 or KnownOp1 * -1
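// Only bits that agree between the two possible results are kept. For
// example (illustrative), if both KnownOp0 and the adjusted KnownOp1 have
// their top 24 bits known zero, the result's top 24 bits are reported as
// zero regardless of which arm the condition selects.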
19134 if (Op.getOpcode() == ARMISD::CSINC)
19135 KnownOp1 = KnownBits::computeForAddSub(
19136 true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
19137 else if (Op.getOpcode() == ARMISD::CSINV)
19138 std::swap(KnownOp1.Zero, KnownOp1.One);
19139 else if (Op.getOpcode() == ARMISD::CSNEG)
19140 KnownOp1 = KnownBits::mul(
19141 KnownOp1, KnownBits::makeConstant(APInt(32, -1)));
19143 Known = KnownBits::commonBits(KnownOp0, KnownOp1);
19144 break;
19149 bool ARMTargetLowering::targetShrinkDemandedConstant(
19150 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
19151 TargetLoweringOpt &TLO) const {
19152 // Delay this optimization, so we don't have to deal with illegal types, or
19153 // block other optimizations.
19154 if (!TLO.LegalOps)
19155 return false;
19157 // Only optimize AND for now.
19158 if (Op.getOpcode() != ISD::AND)
19159 return false;
19161 EVT VT = Op.getValueType();
19163 // Ignore vectors.
19164 if (VT.isVector())
19165 return false;
19167 assert(VT == MVT::i32 && "Unexpected integer type");
19169 // Make sure the RHS really is a constant.
19170 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19171 if (!C)
19172 return false;
19174 unsigned Mask = C->getZExtValue();
19176 unsigned Demanded = DemandedBits.getZExtValue();
19177 unsigned ShrunkMask = Mask & Demanded;
19178 unsigned ExpandedMask = Mask | ~Demanded;
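// Illustrative example (not from the original source): for Mask = 0x1FF and
// Demanded = 0xFF, ShrunkMask is 0xFF and ExpandedMask is 0xFFFFFFFF, so the
// checks below rewrite the AND to use the cheaper uxtb-style mask 0xFF.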
19180 // If the mask is all zeros, let the target-independent code replace the
19181 // result with zero.
19182 if (ShrunkMask == 0)
19183 return false;
19185 // If the mask is all ones, erase the AND. (Currently, the target-independent
19186 // code won't do this, so we have to do it explicitly to avoid an infinite
19187 // loop in obscure cases.)
19188 if (ExpandedMask == ~0U)
19189 return TLO.CombineTo(Op, Op.getOperand(0));
19191 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
19192 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
19194 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
19195 if (NewMask == Mask)
19196 return true;
19197 SDLoc DL(Op);
19198 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
19199 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
19200 return TLO.CombineTo(Op, NewOp);
19203 // Prefer uxtb mask.
19204 if (IsLegalMask(0xFF))
19205 return UseMask(0xFF);
19207 // Prefer uxth mask.
19208 if (IsLegalMask(0xFFFF))
19209 return UseMask(0xFFFF);
19211 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
19212 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19213 if (ShrunkMask < 256)
19214 return UseMask(ShrunkMask);
19216 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
19217 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19218 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
19219 return UseMask(ExpandedMask);
19221 // Potential improvements:
19223 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
19224 // We could try to prefer Thumb1 immediates which can be lowered to a
19225 // two-instruction sequence.
19226 // We could try to recognize more legal ARM/Thumb2 immediates here.
19228 return false;
19231 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
19232 SDValue Op, const APInt &OriginalDemandedBits,
19233 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
19234 unsigned Depth) const {
19235 unsigned Opc = Op.getOpcode();
19237 switch (Opc) {
19238 case ARMISD::ASRL:
19239 case ARMISD::LSRL: {
19240 // If this is result 0 and the other result is unused, see if the demanded
19241 // bits allow us to shrink this long shift into a standard small shift in
19242 // the opposite direction.
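// For example (illustrative): for (LSRL lo, hi, 8) where only the top 8 bits
// of result 0 are demanded, those bits come solely from the low byte of 'hi',
// so the node can be replaced by (SHL hi, 24).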
19243 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
19244 isa<ConstantSDNode>(Op->getOperand(2))) {
19245 unsigned ShAmt = Op->getConstantOperandVal(2);
19246 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
19247 APInt::getAllOnesValue(32) << (32 - ShAmt)))
19248 return TLO.CombineTo(
19249 Op, TLO.DAG.getNode(
19250 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
19251 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
19253 break;
19257 return TargetLowering::SimplifyDemandedBitsForTargetNode(
19258 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
19261 //===----------------------------------------------------------------------===//
19262 // ARM Inline Assembly Support
19263 //===----------------------------------------------------------------------===//
19265 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
19266 // Looking for "rev" which is V6+.
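// A typical match (illustrative): asm("rev $0, $1" : "=l"(out) : "l"(in)) on a
// 32-bit integer is rewritten as a byte-swap intrinsic instead of inline asm.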
19267 if (!Subtarget->hasV6Ops())
19268 return false;
19270 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
19271 std::string AsmStr = IA->getAsmString();
19272 SmallVector<StringRef, 4> AsmPieces;
19273 SplitString(AsmStr, AsmPieces, ";\n");
19275 switch (AsmPieces.size()) {
19276 default: return false;
19277 case 1:
19278 AsmStr = std::string(AsmPieces[0]);
19279 AsmPieces.clear();
19280 SplitString(AsmStr, AsmPieces, " \t,");
19282 // rev $0, $1
19283 if (AsmPieces.size() == 3 &&
19284 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
19285 IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
19286 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
19287 if (Ty && Ty->getBitWidth() == 32)
19288 return IntrinsicLowering::LowerToByteSwap(CI);
19290 break;
19293 return false;
19296 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
19297 // At this point, we have to lower this constraint to something else, so we
19298 // lower it to an "r" or "w". However, by doing this we will force the result
19299 // to be in a register, while the X constraint is much more permissive.
19301 // Although we are correct (we are free to emit anything, without
19302 // constraints), we might break use cases that would expect us to be more
19303 // efficient and emit something else.
19304 if (!Subtarget->hasVFP2Base())
19305 return "r";
19306 if (ConstraintVT.isFloatingPoint())
19307 return "w";
19308 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
19309 (ConstraintVT.getSizeInBits() == 64 ||
19310 ConstraintVT.getSizeInBits() == 128))
19311 return "w";
19313 return "r";
19316 /// getConstraintType - Given a constraint letter, return the type of
19317 /// constraint it is for this target.
19318 ARMTargetLowering::ConstraintType
19319 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
19320 unsigned S = Constraint.size();
19321 if (S == 1) {
19322 switch (Constraint[0]) {
19323 default: break;
19324 case 'l': return C_RegisterClass;
19325 case 'w': return C_RegisterClass;
19326 case 'h': return C_RegisterClass;
19327 case 'x': return C_RegisterClass;
19328 case 't': return C_RegisterClass;
19329 case 'j': return C_Immediate; // Constant for movw.
19330 // An address with a single base register. Due to the way we
19331 // currently handle addresses, it is the same as an 'r' memory constraint.
19332 case 'Q': return C_Memory;
19334 } else if (S == 2) {
19335 switch (Constraint[0]) {
19336 default: break;
19337 case 'T': return C_RegisterClass;
19338 // All 'U+' constraints are addresses.
19339 case 'U': return C_Memory;
19342 return TargetLowering::getConstraintType(Constraint);
19345 /// Examine constraint type and operand type and determine a weight value.
19346 /// This object must already have been set up with the operand type
19347 /// and the current alternative constraint selected.
19348 TargetLowering::ConstraintWeight
19349 ARMTargetLowering::getSingleConstraintMatchWeight(
19350 AsmOperandInfo &info, const char *constraint) const {
19351 ConstraintWeight weight = CW_Invalid;
19352 Value *CallOperandVal = info.CallOperandVal;
19353 // If we don't have a value, we can't do a match,
19354 // but allow it at the lowest weight.
19355 if (!CallOperandVal)
19356 return CW_Default;
19357 Type *type = CallOperandVal->getType();
19358 // Look at the constraint type.
19359 switch (*constraint) {
19360 default:
19361 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
19362 break;
19363 case 'l':
19364 if (type->isIntegerTy()) {
19365 if (Subtarget->isThumb())
19366 weight = CW_SpecificReg;
19367 else
19368 weight = CW_Register;
19370 break;
19371 case 'w':
19372 if (type->isFloatingPointTy())
19373 weight = CW_Register;
19374 break;
19376 return weight;
19379 using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
19381 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
19382 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
19383 switch (Constraint.size()) {
19384 case 1:
19385 // GCC ARM Constraint Letters
19386 switch (Constraint[0]) {
19387 case 'l': // Low regs or general regs.
19388 if (Subtarget->isThumb())
19389 return RCPair(0U, &ARM::tGPRRegClass);
19390 return RCPair(0U, &ARM::GPRRegClass);
19391 case 'h': // High regs or no regs.
19392 if (Subtarget->isThumb())
19393 return RCPair(0U, &ARM::hGPRRegClass);
19394 break;
19395 case 'r':
19396 if (Subtarget->isThumb1Only())
19397 return RCPair(0U, &ARM::tGPRRegClass);
19398 return RCPair(0U, &ARM::GPRRegClass);
19399 case 'w':
19400 if (VT == MVT::Other)
19401 break;
19402 if (VT == MVT::f32)
19403 return RCPair(0U, &ARM::SPRRegClass);
19404 if (VT.getSizeInBits() == 64)
19405 return RCPair(0U, &ARM::DPRRegClass);
19406 if (VT.getSizeInBits() == 128)
19407 return RCPair(0U, &ARM::QPRRegClass);
19408 break;
19409 case 'x':
19410 if (VT == MVT::Other)
19411 break;
19412 if (VT == MVT::f32)
19413 return RCPair(0U, &ARM::SPR_8RegClass);
19414 if (VT.getSizeInBits() == 64)
19415 return RCPair(0U, &ARM::DPR_8RegClass);
19416 if (VT.getSizeInBits() == 128)
19417 return RCPair(0U, &ARM::QPR_8RegClass);
19418 break;
19419 case 't':
19420 if (VT == MVT::Other)
19421 break;
19422 if (VT == MVT::f32 || VT == MVT::i32)
19423 return RCPair(0U, &ARM::SPRRegClass);
19424 if (VT.getSizeInBits() == 64)
19425 return RCPair(0U, &ARM::DPR_VFP2RegClass);
19426 if (VT.getSizeInBits() == 128)
19427 return RCPair(0U, &ARM::QPR_VFP2RegClass);
19428 break;
19430 break;
19432 case 2:
19433 if (Constraint[0] == 'T') {
19434 switch (Constraint[1]) {
19435 default:
19436 break;
19437 case 'e':
19438 return RCPair(0U, &ARM::tGPREvenRegClass);
19439 case 'o':
19440 return RCPair(0U, &ARM::tGPROddRegClass);
19443 break;
19445 default:
19446 break;
19449 if (StringRef("{cc}").equals_insensitive(Constraint))
19450 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
19452 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19455 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
19456 /// vector. If it is invalid, don't add anything to Ops.
19457 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
19458 std::string &Constraint,
19459 std::vector<SDValue>&Ops,
19460 SelectionDAG &DAG) const {
19461 SDValue Result;
19463 // Currently only support length 1 constraints.
19464 if (Constraint.length() != 1) return;
19466 char ConstraintLetter = Constraint[0];
19467 switch (ConstraintLetter) {
19468 default: break;
19469 case 'j':
19470 case 'I': case 'J': case 'K': case 'L':
19471 case 'M': case 'N': case 'O':
19472 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
19473 if (!C)
19474 return;
19476 int64_t CVal64 = C->getSExtValue();
19477 int CVal = (int) CVal64;
19478 // None of these constraints allow values larger than 32 bits. Check
19479 // that the value fits in an int.
19480 if (CVal != CVal64)
19481 return;
19483 switch (ConstraintLetter) {
19484 case 'j':
19485 // Constant suitable for movw, must be between 0 and
19486 // 65535.
19487 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
19488 if (CVal >= 0 && CVal <= 65535)
19489 break;
19490 return;
19491 case 'I':
19492 if (Subtarget->isThumb1Only()) {
19493 // This must be a constant between 0 and 255, for ADD
19494 // immediates.
19495 if (CVal >= 0 && CVal <= 255)
19496 break;
19497 } else if (Subtarget->isThumb2()) {
19498 // A constant that can be used as an immediate value in a
19499 // data-processing instruction.
19500 if (ARM_AM::getT2SOImmVal(CVal) != -1)
19501 break;
19502 } else {
19503 // A constant that can be used as an immediate value in a
19504 // data-processing instruction.
19505 if (ARM_AM::getSOImmVal(CVal) != -1)
19506 break;
19508 return;
19510 case 'J':
19511 if (Subtarget->isThumb1Only()) {
19512 // This must be a constant between -255 and -1, for negated ADD
19513 // immediates. This can be used in GCC with an "n" modifier that
19514 // prints the negated value, for use with SUB instructions. It is
19515 // not useful otherwise but is implemented for compatibility.
19516 if (CVal >= -255 && CVal <= -1)
19517 break;
19518 } else {
19519 // This must be a constant between -4095 and 4095. It is not clear
19520 // what this constraint is intended for. Implemented for
19521 // compatibility with GCC.
19522 if (CVal >= -4095 && CVal <= 4095)
19523 break;
19525 return;
19527 case 'K':
19528 if (Subtarget->isThumb1Only()) {
19529 // A 32-bit value where only one byte has a nonzero value. Exclude
19530 // zero to match GCC. This constraint is used by GCC internally for
19531 // constants that can be loaded with a move/shift combination.
19532 // It is not useful otherwise but is implemented for compatibility.
19533 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
19534 break;
19535 } else if (Subtarget->isThumb2()) {
19536 // A constant whose bitwise inverse can be used as an immediate
19537 // value in a data-processing instruction. This can be used in GCC
19538 // with a "B" modifier that prints the inverted value, for use with
19539 // BIC and MVN instructions. It is not useful otherwise but is
19540 // implemented for compatibility.
19541 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
19542 break;
19543 } else {
19544 // A constant whose bitwise inverse can be used as an immediate
19545 // value in a data-processing instruction. This can be used in GCC
19546 // with a "B" modifier that prints the inverted value, for use with
19547 // BIC and MVN instructions. It is not useful otherwise but is
19548 // implemented for compatibility.
19549 if (ARM_AM::getSOImmVal(~CVal) != -1)
19550 break;
19552 return;
19554 case 'L':
19555 if (Subtarget->isThumb1Only()) {
19556 // This must be a constant between -7 and 7,
19557 // for 3-operand ADD/SUB immediate instructions.
19558 if (CVal >= -7 && CVal < 7)
19559 break;
19560 } else if (Subtarget->isThumb2()) {
19561 // A constant whose negation can be used as an immediate value in a
19562 // data-processing instruction. This can be used in GCC with an "n"
19563 // modifier that prints the negated value, for use with SUB
19564 // instructions. It is not useful otherwise but is implemented for
19565 // compatibility.
19566 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
19567 break;
19568 } else {
19569 // A constant whose negation can be used as an immediate value in a
19570 // data-processing instruction. This can be used in GCC with an "n"
19571 // modifier that prints the negated value, for use with SUB
19572 // instructions. It is not useful otherwise but is implemented for
19573 // compatibility.
19574 if (ARM_AM::getSOImmVal(-CVal) != -1)
19575 break;
19577 return;
19579 case 'M':
19580 if (Subtarget->isThumb1Only()) {
19581 // This must be a multiple of 4 between 0 and 1020, for
19582 // ADD sp + immediate.
19583 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
19584 break;
19585 } else {
19586 // A power of two or a constant between 0 and 32. This is used in
19587 // GCC for the shift amount on shifted register operands, but it is
19588 // useful in general for any shift amounts.
19589 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
19590 break;
19592 return;
19594 case 'N':
19595 if (Subtarget->isThumb1Only()) {
19596 // This must be a constant between 0 and 31, for shift amounts.
19597 if (CVal >= 0 && CVal <= 31)
19598 break;
19600 return;
19602 case 'O':
19603 if (Subtarget->isThumb1Only()) {
19604 // This must be a multiple of 4 between -508 and 508, for
19605 // ADD/SUB sp = sp + immediate.
19606 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
19607 break;
19609 return;
19611 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
19612 break;
19615 if (Result.getNode()) {
19616 Ops.push_back(Result);
19617 return;
19619 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19622 static RTLIB::Libcall getDivRemLibcall(
19623 const SDNode *N, MVT::SimpleValueType SVT) {
19624 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
19625 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
19626 "Unhandled Opcode in getDivRemLibcall");
19627 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
19628 N->getOpcode() == ISD::SREM;
19629 RTLIB::Libcall LC;
19630 switch (SVT) {
19631 default: llvm_unreachable("Unexpected request for libcall!");
19632 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
19633 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
19634 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
19635 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
19637 return LC;
19640 static TargetLowering::ArgListTy getDivRemArgList(
19641 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
19642 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
19643 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
19644 "Unhandled Opcode in getDivRemArgList");
19645 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
19646 N->getOpcode() == ISD::SREM;
19647 TargetLowering::ArgListTy Args;
19648 TargetLowering::ArgListEntry Entry;
19649 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
19650 EVT ArgVT = N->getOperand(i).getValueType();
19651 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
19652 Entry.Node = N->getOperand(i);
19653 Entry.Ty = ArgTy;
19654 Entry.IsSExt = isSigned;
19655 Entry.IsZExt = !isSigned;
19656 Args.push_back(Entry);
19658 if (Subtarget->isTargetWindows() && Args.size() >= 2)
19659 std::swap(Args[0], Args[1]);
19660 return Args;
19663 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
19664 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
19665 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
19666 Subtarget->isTargetWindows()) &&
19667 "Register-based DivRem lowering only");
19668 unsigned Opcode = Op->getOpcode();
19669 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
19670 "Invalid opcode for Div/Rem lowering");
19671 bool isSigned = (Opcode == ISD::SDIVREM);
19672 EVT VT = Op->getValueType(0);
19673 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
19674 SDLoc dl(Op);
19676 // If the target has hardware divide, use divide + multiply + subtract:
19677 // div = a / b
19678 // rem = a - b * div
19679 // return {div, rem}
19680 // This should be lowered into UDIV/SDIV + MLS later on.
19681 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
19682 : Subtarget->hasDivideInARMMode();
19683 if (hasDivide && Op->getValueType(0).isSimple() &&
19684 Op->getSimpleValueType(0) == MVT::i32) {
19685 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
19686 const SDValue Dividend = Op->getOperand(0);
19687 const SDValue Divisor = Op->getOperand(1);
19688 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
19689 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
19690 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
19692 SDValue Values[2] = {Div, Rem};
19693 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
19696 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
19697 VT.getSimpleVT().SimpleTy);
19698 SDValue InChain = DAG.getEntryNode();
19700 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
19701 DAG.getContext(),
19702 Subtarget);
19704 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
19705 getPointerTy(DAG.getDataLayout()));
19707 Type *RetTy = StructType::get(Ty, Ty);
19709 if (Subtarget->isTargetWindows())
19710 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
19712 TargetLowering::CallLoweringInfo CLI(DAG);
19713 CLI.setDebugLoc(dl).setChain(InChain)
19714 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
19715 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
19717 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
19718 return CallInfo.first;
19721 // Lowers REM using the divmod helpers
19722 // (see RTABI sections 4.2/4.3).
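// For example (assuming the usual AEABI helper names): an i32 srem becomes a
// call to __aeabi_idivmod, and the remainder is taken from the call's second
// result below.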
19723 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
19724 // Build return types (div and rem)
19725 std::vector<Type*> RetTyParams;
19726 Type *RetTyElement;
19728 switch (N->getValueType(0).getSimpleVT().SimpleTy) {
19729 default: llvm_unreachable("Unexpected request for libcall!");
19730 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
19731 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
19732 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
19733 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
19736 RetTyParams.push_back(RetTyElement);
19737 RetTyParams.push_back(RetTyElement);
19738 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
19739 Type *RetTy = StructType::get(*DAG.getContext(), ret);
19741 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
19742 SimpleTy);
19743 SDValue InChain = DAG.getEntryNode();
19744 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
19745 Subtarget);
19746 bool isSigned = N->getOpcode() == ISD::SREM;
19747 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
19748 getPointerTy(DAG.getDataLayout()));
19750 if (Subtarget->isTargetWindows())
19751 InChain = WinDBZCheckDenominator(DAG, N, InChain);
19753 // Lower call
19754 CallLoweringInfo CLI(DAG);
19755 CLI.setChain(InChain)
19756 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
19757 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
19758 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
19760 // Return second (rem) result operand (first contains div)
19761 SDNode *ResNode = CallResult.first.getNode();
19762 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
19763 return ResNode->getOperand(1);
19766 SDValue
19767 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
19768 assert(Subtarget->isTargetWindows() && "unsupported target platform");
19769 SDLoc DL(Op);
19771 // Get the inputs.
19772 SDValue Chain = Op.getOperand(0);
19773 SDValue Size = Op.getOperand(1);
19775 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
19776 "no-stack-arg-probe")) {
19777 MaybeAlign Align =
19778 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
19779 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
19780 Chain = SP.getValue(1);
19781 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
19782 if (Align)
19783 SP =
19784 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
19785 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
19786 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
19787 SDValue Ops[2] = { SP, Chain };
19788 return DAG.getMergeValues(Ops, DL);
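// Otherwise emit a stack probe: the size is passed in r4 in units of words
// (hence the shift right by 2 below), the WIN__CHKSTK node performs the probe
// and the sp adjustment, and the adjusted sp is read back as the result.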
19791 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
19792 DAG.getConstant(2, DL, MVT::i32));
19794 SDValue Flag;
19795 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
19796 Flag = Chain.getValue(1);
19798 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19799 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
19801 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
19802 Chain = NewSP.getValue(1);
19804 SDValue Ops[2] = { NewSP, Chain };
19805 return DAG.getMergeValues(Ops, DL);
19808 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
19809 bool IsStrict = Op->isStrictFPOpcode();
19810 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
19811 const unsigned DstSz = Op.getValueType().getSizeInBits();
19812 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
19813 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
19814 "Unexpected type for custom-lowering FP_EXTEND");
19816 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
19817 "With both FP DP and 16, any FP conversion is legal!");
19819 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
19820 "With FP16, 16 to 32 conversion is legal!");
19822 // Converting from 32 -> 64 is valid if we have FP64.
19823 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
19824 // FIXME: Remove this when we have strict fp instruction selection patterns
19825 if (IsStrict) {
19826 SDLoc Loc(Op);
19827 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
19828 Loc, Op.getValueType(), SrcVal);
19829 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
19831 return Op;
19834 // Either we are converting from 16 -> 64 without FP16 and/or without
19835 // double-precision FP (or without Armv8 FP), so we must do the conversion
19836 // in two steps.
19837 // Or we are converting from 32 -> 64 without double-precision FP, or 16 -> 32
19838 // without FP16. In those cases we must make a libcall.
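// For instance (illustrative): extending f16 to f64 with FP16 but without
// FP64 is done as a legal f16 -> f32 FP_EXTEND followed by a libcall for
// f32 -> f64.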
19839 SDLoc Loc(Op);
19840 RTLIB::Libcall LC;
19841 MakeLibCallOptions CallOptions;
19842 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
19843 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
19844 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
19845 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
19846 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
19847 if (Supported) {
19848 if (IsStrict) {
19849 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
19850 {DstVT, MVT::Other}, {Chain, SrcVal});
19851 Chain = SrcVal.getValue(1);
19852 } else {
19853 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
19855 } else {
19856 LC = RTLIB::getFPEXT(SrcVT, DstVT);
19857 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
19858 "Unexpected type for custom-lowering FP_EXTEND");
19859 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
19860 Loc, Chain);
19864 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
19867 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
19868 bool IsStrict = Op->isStrictFPOpcode();
19870 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
19871 EVT SrcVT = SrcVal.getValueType();
19872 EVT DstVT = Op.getValueType();
19873 const unsigned DstSz = Op.getValueType().getSizeInBits();
19874 const unsigned SrcSz = SrcVT.getSizeInBits();
19875 (void)DstSz;
19876 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
19877 "Unexpected type for custom-lowering FP_ROUND");
19879 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
19880 "With both FP DP and 16, any FP conversion is legal!");
19882 SDLoc Loc(Op);
19884 // A single 32 -> 16 conversion instruction is available if we have FP16.
19885 if (SrcSz == 32 && Subtarget->hasFP16())
19886 return Op;
19888 // Libcall for 32 -> 16 and 64 -> [32, 16] conversions.
19889 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
19890 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
19891 "Unexpected type for custom-lowering FP_ROUND");
19892 MakeLibCallOptions CallOptions;
19893 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
19894 SDValue Result;
19895 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
19896 Loc, Chain);
19897 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
19900 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
19901 SelectionDAG &DAG) const {
19902 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
19903 MVT HalfT = MVT::i32;
19904 SDLoc dl(N);
19905 SDValue Hi, Lo, Tmp;
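// Expansion sketch (the standard abs idiom, shown for illustration): with
// s = x >> 63 arithmetically sign-splatted over both 32-bit halves,
//   abs(x) == (x + s) ^ s
// which is what the UADDO/ADDCARRY plus XOR sequence below computes.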
19907 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
19908 !isOperationLegalOrCustom(ISD::UADDO, HalfT))
19909 return ;
19911 unsigned OpTypeBits = HalfT.getScalarSizeInBits();
19912 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
19914 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
19915 DAG.getConstant(0, dl, HalfT));
19916 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
19917 DAG.getConstant(1, dl, HalfT));
19919 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
19920 DAG.getConstant(OpTypeBits - 1, dl,
19921 getShiftAmountTy(HalfT, DAG.getDataLayout())));
19922 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
19923 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
19924 SDValue(Lo.getNode(), 1));
19925 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
19926 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
19928 Results.push_back(Lo);
19929 Results.push_back(Hi);
19932 bool
19933 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
19934 // The ARM target isn't yet aware of offsets.
19935 return false;
19938 bool ARM::isBitFieldInvertedMask(unsigned v) {
19939 if (v == 0xffffffff)
19940 return false;
19942 // There can be 1's on either or both "outsides"; all the "inside"
19943 // bits must be 0's.
19944 return isShiftedMask_32(~v);
19947 /// isFPImmLegal - Returns true if the target can instruction select the
19948 /// specified FP immediate natively. If false, the legalizer will
19949 /// materialize the FP immediate as a load from a constant pool.
19950 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
19951 bool ForCodeSize) const {
19952 if (!Subtarget->hasVFP3Base())
19953 return false;
19954 if (VT == MVT::f16 && Subtarget->hasFullFP16())
19955 return ARM_AM::getFP16Imm(Imm) != -1;
19956 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
19957 ARM_AM::getFP32FP16Imm(Imm) != -1)
19958 return true;
19959 if (VT == MVT::f32)
19960 return ARM_AM::getFP32Imm(Imm) != -1;
19961 if (VT == MVT::f64 && Subtarget->hasFP64())
19962 return ARM_AM::getFP64Imm(Imm) != -1;
19963 return false;
19966 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
19967 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
19968 /// specified in the intrinsic calls.
19969 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
19970 const CallInst &I,
19971 MachineFunction &MF,
19972 unsigned Intrinsic) const {
19973 switch (Intrinsic) {
19974 case Intrinsic::arm_neon_vld1:
19975 case Intrinsic::arm_neon_vld2:
19976 case Intrinsic::arm_neon_vld3:
19977 case Intrinsic::arm_neon_vld4:
19978 case Intrinsic::arm_neon_vld2lane:
19979 case Intrinsic::arm_neon_vld3lane:
19980 case Intrinsic::arm_neon_vld4lane:
19981 case Intrinsic::arm_neon_vld2dup:
19982 case Intrinsic::arm_neon_vld3dup:
19983 case Intrinsic::arm_neon_vld4dup: {
19984 Info.opc = ISD::INTRINSIC_W_CHAIN;
19985 // Conservatively set memVT to the entire set of vectors loaded.
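// For example (illustrative): a vld4 returning four <4 x i32> vectors covers
// 512 bits, which is recorded here as an 8 x i64 memory type.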
19986 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
19987 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
19988 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
19989 Info.ptrVal = I.getArgOperand(0);
19990 Info.offset = 0;
19991 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
19992 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
19993 // volatile loads with NEON intrinsics not supported
19994 Info.flags = MachineMemOperand::MOLoad;
19995 return true;
19997 case Intrinsic::arm_neon_vld1x2:
19998 case Intrinsic::arm_neon_vld1x3:
19999 case Intrinsic::arm_neon_vld1x4: {
20000 Info.opc = ISD::INTRINSIC_W_CHAIN;
20001 // Conservatively set memVT to the entire set of vectors loaded.
20002 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
20003 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20004 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20005 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
20006 Info.offset = 0;
20007 Info.align.reset();
20008 // volatile loads with NEON intrinsics not supported
20009 Info.flags = MachineMemOperand::MOLoad;
20010 return true;
20012 case Intrinsic::arm_neon_vst1:
20013 case Intrinsic::arm_neon_vst2:
20014 case Intrinsic::arm_neon_vst3:
20015 case Intrinsic::arm_neon_vst4:
20016 case Intrinsic::arm_neon_vst2lane:
20017 case Intrinsic::arm_neon_vst3lane:
20018 case Intrinsic::arm_neon_vst4lane: {
20019 Info.opc = ISD::INTRINSIC_VOID;
20020 // Conservatively set memVT to the entire set of vectors stored.
20021 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
20022 unsigned NumElts = 0;
20023 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
20024 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20025 if (!ArgTy->isVectorTy())
20026 break;
20027 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20029 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20030 Info.ptrVal = I.getArgOperand(0);
20031 Info.offset = 0;
20032 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
20033 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20034 // volatile stores with NEON intrinsics not supported
20035 Info.flags = MachineMemOperand::MOStore;
20036 return true;
20038 case Intrinsic::arm_neon_vst1x2:
20039 case Intrinsic::arm_neon_vst1x3:
20040 case Intrinsic::arm_neon_vst1x4: {
20041 Info.opc = ISD::INTRINSIC_VOID;
20042 // Conservatively set memVT to the entire set of vectors stored.
20043 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
20044 unsigned NumElts = 0;
20045 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
20046 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20047 if (!ArgTy->isVectorTy())
20048 break;
20049 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20051 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20052 Info.ptrVal = I.getArgOperand(0);
20053 Info.offset = 0;
20054 Info.align.reset();
20055 // volatile stores with NEON intrinsics not supported
20056 Info.flags = MachineMemOperand::MOStore;
20057 return true;
20059 case Intrinsic::arm_mve_vld2q:
20060 case Intrinsic::arm_mve_vld4q: {
20061 Info.opc = ISD::INTRINSIC_W_CHAIN;
20062 // Conservatively set memVT to the entire set of vectors loaded.
20063 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20064 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20065 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20066 Info.ptrVal = I.getArgOperand(0);
20067 Info.offset = 0;
20068 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20069 // volatile loads with MVE intrinsics not supported
20070 Info.flags = MachineMemOperand::MOLoad;
20071 return true;
20073 case Intrinsic::arm_mve_vst2q:
20074 case Intrinsic::arm_mve_vst4q: {
20075 Info.opc = ISD::INTRINSIC_VOID;
20076 // Conservatively set memVT to the entire set of vectors stored.
20077 Type *VecTy = I.getArgOperand(1)->getType();
20078 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20079 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20080 Info.ptrVal = I.getArgOperand(0);
20081 Info.offset = 0;
20082 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20083 // volatile stores with MVE intrinsics not supported
20084 Info.flags = MachineMemOperand::MOStore;
20085 return true;
20087 case Intrinsic::arm_mve_vldr_gather_base:
20088 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20089 Info.opc = ISD::INTRINSIC_W_CHAIN;
20090 Info.ptrVal = nullptr;
20091 Info.memVT = MVT::getVT(I.getType());
20092 Info.align = Align(1);
20093 Info.flags |= MachineMemOperand::MOLoad;
20094 return true;
20096 case Intrinsic::arm_mve_vldr_gather_base_wb:
20097 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20098 Info.opc = ISD::INTRINSIC_W_CHAIN;
20099 Info.ptrVal = nullptr;
20100 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
20101 Info.align = Align(1);
20102 Info.flags |= MachineMemOperand::MOLoad;
20103 return true;
20105 case Intrinsic::arm_mve_vldr_gather_offset:
20106 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
20107 Info.opc = ISD::INTRINSIC_W_CHAIN;
20108 Info.ptrVal = nullptr;
20109 MVT DataVT = MVT::getVT(I.getType());
20110 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
20111 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20112 DataVT.getVectorNumElements());
20113 Info.align = Align(1);
20114 Info.flags |= MachineMemOperand::MOLoad;
20115 return true;
20117 case Intrinsic::arm_mve_vstr_scatter_base:
20118 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
20119 Info.opc = ISD::INTRINSIC_VOID;
20120 Info.ptrVal = nullptr;
20121 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20122 Info.align = Align(1);
20123 Info.flags |= MachineMemOperand::MOStore;
20124 return true;
20126 case Intrinsic::arm_mve_vstr_scatter_base_wb:
20127 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
20128 Info.opc = ISD::INTRINSIC_W_CHAIN;
20129 Info.ptrVal = nullptr;
20130 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20131 Info.align = Align(1);
20132 Info.flags |= MachineMemOperand::MOStore;
20133 return true;
20135 case Intrinsic::arm_mve_vstr_scatter_offset:
20136 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
20137 Info.opc = ISD::INTRINSIC_VOID;
20138 Info.ptrVal = nullptr;
20139 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
20140 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
20141 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20142 DataVT.getVectorNumElements());
20143 Info.align = Align(1);
20144 Info.flags |= MachineMemOperand::MOStore;
20145 return true;
20147 case Intrinsic::arm_ldaex:
20148 case Intrinsic::arm_ldrex: {
20149 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
20150 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
20151 Info.opc = ISD::INTRINSIC_W_CHAIN;
20152 Info.memVT = MVT::getVT(PtrTy->getElementType());
20153 Info.ptrVal = I.getArgOperand(0);
20154 Info.offset = 0;
20155 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
20156 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
20157 return true;
20159 case Intrinsic::arm_stlex:
20160 case Intrinsic::arm_strex: {
20161 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
20162 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
20163 Info.opc = ISD::INTRINSIC_W_CHAIN;
20164 Info.memVT = MVT::getVT(PtrTy->getElementType());
20165 Info.ptrVal = I.getArgOperand(1);
20166 Info.offset = 0;
20167 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
20168 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
20169 return true;
20171 case Intrinsic::arm_stlexd:
20172 case Intrinsic::arm_strexd:
20173 Info.opc = ISD::INTRINSIC_W_CHAIN;
20174 Info.memVT = MVT::i64;
20175 Info.ptrVal = I.getArgOperand(2);
20176 Info.offset = 0;
20177 Info.align = Align(8);
20178 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
20179 return true;
20181 case Intrinsic::arm_ldaexd:
20182 case Intrinsic::arm_ldrexd:
20183 Info.opc = ISD::INTRINSIC_W_CHAIN;
20184 Info.memVT = MVT::i64;
20185 Info.ptrVal = I.getArgOperand(0);
20186 Info.offset = 0;
20187 Info.align = Align(8);
20188 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
20189 return true;
20191 default:
20192 break;
20195 return false;
20198 /// Returns true if it is beneficial to convert a load of a constant
20199 /// to just the constant itself.
20200 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
20201 Type *Ty) const {
20202 assert(Ty->isIntegerTy());
20204 unsigned Bits = Ty->getPrimitiveSizeInBits();
20205 if (Bits == 0 || Bits > 32)
20206 return false;
20207 return true;
20210 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
20211 unsigned Index) const {
20212 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
20213 return false;
20215 return (Index == 0 || Index == ResVT.getVectorNumElements());
20218 Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
20219 ARM_MB::MemBOpt Domain) const {
20220 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20222 // First, if the target has no DMB, see what fallback we can use.
20223 if (!Subtarget->hasDataBarrier()) {
20224 // Some ARMv6 CPUs can support data barriers with an mcr instruction.
20225 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
20226 // here.
20227 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
20228 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
20229 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
20230 Builder.getInt32(0), Builder.getInt32(7),
20231 Builder.getInt32(10), Builder.getInt32(5)};
20232 return Builder.CreateCall(MCR, args);
20233 } else {
20234 // Instead of using barriers, atomic accesses on these subtargets use
20235 // libcalls.
20236 llvm_unreachable("makeDMB on a target so old that it has no barriers");
20238 } else {
20239 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
20240 // Only a full system barrier exists in the M-class architectures.
20241 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
20242 Constant *CDomain = Builder.getInt32(Domain);
20243 return Builder.CreateCall(DMB, CDomain);
20247 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
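// In short (summary of the two hooks below): a DMB is emitted before
// operations with release semantics that store, and after operations with
// acquire semantics; monotonic and unordered accesses need no fences.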
20248 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
20249 Instruction *Inst,
20250 AtomicOrdering Ord) const {
20251 switch (Ord) {
20252 case AtomicOrdering::NotAtomic:
20253 case AtomicOrdering::Unordered:
20254 llvm_unreachable("Invalid fence: unordered/non-atomic");
20255 case AtomicOrdering::Monotonic:
20256 case AtomicOrdering::Acquire:
20257 return nullptr; // Nothing to do
20258 case AtomicOrdering::SequentiallyConsistent:
20259 if (!Inst->hasAtomicStore())
20260 return nullptr; // Nothing to do
20261 LLVM_FALLTHROUGH;
20262 case AtomicOrdering::Release:
20263 case AtomicOrdering::AcquireRelease:
20264 if (Subtarget->preferISHSTBarriers())
20265 return makeDMB(Builder, ARM_MB::ISHST);
20266 // FIXME: add a comment with a link to documentation justifying this.
20267 else
20268 return makeDMB(Builder, ARM_MB::ISH);
20270 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
20273 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
20274 Instruction *Inst,
20275 AtomicOrdering Ord) const {
20276 switch (Ord) {
20277 case AtomicOrdering::NotAtomic:
20278 case AtomicOrdering::Unordered:
20279 llvm_unreachable("Invalid fence: unordered/not-atomic");
20280 case AtomicOrdering::Monotonic:
20281 case AtomicOrdering::Release:
20282 return nullptr; // Nothing to do
20283 case AtomicOrdering::Acquire:
20284 case AtomicOrdering::AcquireRelease:
20285 case AtomicOrdering::SequentiallyConsistent:
20286 return makeDMB(Builder, ARM_MB::ISH);
20288 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
20291 // Loads and stores of less than 64 bits are already atomic; ones above that
20292 // are doomed anyway, so defer to the default libcall and blame the OS when
20293 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
20294 // anything for those.
20295 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20296 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
20297 return (Size == 64) && !Subtarget->isMClass();
20300 // Loads and stores of less than 64 bits are already atomic; ones above that
20301 // are doomed anyway, so defer to the default libcall and blame the OS when
20302 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
20303 // anything for those.
20304 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
20305 // guarantee, see DDI0406C ARM architecture reference manual,
20306 // sections A8.8.72-74 LDRD)
20307 TargetLowering::AtomicExpansionKind
20308 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20309 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
20310 return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
20311 : AtomicExpansionKind::None;
20314 // For the real atomic operations, we have ldrex/strex up to 32 bits,
20315 // and up to 64 bits on the non-M profiles
20316 TargetLowering::AtomicExpansionKind
20317 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
20318 if (AI->isFloatingPointOperation())
20319 return AtomicExpansionKind::CmpXChg;
20321 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
20322 // implement atomicrmw without spilling. If the target address is also on the
20323 // stack and close enough to the spill slot, this can lead to a situation
20324 // where the monitor always gets cleared and the atomic operation can never
20325 // succeed. So at -O0 lower this operation to a CAS loop.
20326 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
20327 return AtomicExpansionKind::CmpXChg;
20329 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
20330 bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
20331 return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
20332 ? AtomicExpansionKind::LLSC
20333 : AtomicExpansionKind::None;
20336 // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
20337 // bits, and up to 64 bits on the non-M profiles.
20338 TargetLowering::AtomicExpansionKind
20339 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
20340 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
20341 // implement cmpxchg without spilling. If the address being exchanged is also
20342 // on the stack and close enough to the spill slot, this can lead to a
20343 // situation where the monitor always gets cleared and the atomic operation
20344 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
20345 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
20346 bool HasAtomicCmpXchg =
20347 !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
20348 if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
20349 Size <= (Subtarget->isMClass() ? 32U : 64U))
20350 return AtomicExpansionKind::LLSC;
20351 return AtomicExpansionKind::None;
20354 bool ARMTargetLowering::shouldInsertFencesForAtomic(
20355 const Instruction *I) const {
20356 return InsertFencesForAtomic;
20359 // This has so far only been implemented for MachO.
20360 bool ARMTargetLowering::useLoadStackGuardNode() const {
20361 return Subtarget->isTargetMachO();
20364 void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
20365 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20366 return TargetLowering::insertSSPDeclarations(M);
20368 // The MSVC CRT has a global variable holding the security cookie.
20369 M.getOrInsertGlobal("__security_cookie",
20370 Type::getInt8PtrTy(M.getContext()));
20372 // The MSVC CRT has a function to validate the security cookie.
20373 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
20374 "__security_check_cookie", Type::getVoidTy(M.getContext()),
20375 Type::getInt8PtrTy(M.getContext()));
20376 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
20377 F->addParamAttr(0, Attribute::AttrKind::InReg);
20380 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
20381 // The MSVC CRT has a global variable holding the security cookie.
20382 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20383 return M.getGlobalVariable("__security_cookie");
20384 return TargetLowering::getSDagStackGuard(M);
20387 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
20388 // The MSVC CRT has a function to validate the security cookie.
20389 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20390 return M.getFunction("__security_check_cookie");
20391 return TargetLowering::getSSPStackGuardCheck(M);
20394 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
20395 unsigned &Cost) const {
20396 // If we do not have NEON, vector types are not natively supported.
20397 if (!Subtarget->hasNEON())
20398 return false;
20400 // Floating point values and vector values map to the same register file.
20401 // Therefore, although we could do a store + extract on a vector type, it is
20402 // better to leave it as a float, since we have more freedom in the addressing
20403 // mode for those.
20404 if (VectorTy->isFPOrFPVectorTy())
20405 return false;
20407 // If the index is unknown at compile time, this is very expensive to lower
20408 // and it is not possible to combine the store with the extract.
20409 if (!isa<ConstantInt>(Idx))
20410 return false;
20412 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
20413 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
20414 // We can do a store + vector extract on any vector that fits perfectly in a D
20415 // or Q register.
20416 if (BitWidth == 64 || BitWidth == 128) {
20417 Cost = 0;
20418 return true;
20420 return false;
20423 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
20424 return Subtarget->hasV6T2Ops();
20427 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
20428 return Subtarget->hasV6T2Ops();
20431 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
20432 return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
20435 Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
20436 Value *Addr,
20437 AtomicOrdering Ord) const {
20438 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20439 bool IsAcquire = isAcquireOrStronger(Ord);
20441 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
20442 // intrinsic must return {i32, i32} and we have to recombine them into a
20443 // single i64 here.
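// Roughly (IR sketch, for illustration):
//   %lohi = call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
//   %val  = zext(%lohi.lo) | (zext(%lohi.hi) << 32)  ; halves swapped on BE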
20444 if (ValueTy->getPrimitiveSizeInBits() == 64) {
20445 Intrinsic::ID Int =
20446 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
20447 Function *Ldrex = Intrinsic::getDeclaration(M, Int);
20449 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
20450 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
20452 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
20453 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
20454 if (!Subtarget->isLittle())
20455 std::swap (Lo, Hi);
20456 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
20457 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
20458 return Builder.CreateOr(
20459 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
20462 Type *Tys[] = { Addr->getType() };
20463 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
20464 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
20466 return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy);
20469 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
20470 IRBuilderBase &Builder) const {
20471 if (!Subtarget->hasV7Ops())
20472 return;
20473 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20474 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
20477 Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
20478 Value *Val, Value *Addr,
20479 AtomicOrdering Ord) const {
20480 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20481 bool IsRelease = isReleaseOrStronger(Ord);
20483 // Since the intrinsics must have legal type, the i64 intrinsics take two
20484 // parameters: "i32, i32". We must marshal Val into the appropriate form
20485 // before the call.
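// Roughly (sketch): %lo = trunc(Val), %hi = trunc(Val >> 32), then
//   call i32 @llvm.arm.strexd(i32 %lo, i32 %hi, i8* %addr)
// with the halves swapped on big-endian targets.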
20486 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
20487 Intrinsic::ID Int =
20488 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
20489 Function *Strex = Intrinsic::getDeclaration(M, Int);
20490 Type *Int32Ty = Type::getInt32Ty(M->getContext());
20492 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
20493 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
20494 if (!Subtarget->isLittle())
20495 std::swap(Lo, Hi);
20496 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
20497 return Builder.CreateCall(Strex, {Lo, Hi, Addr});
20500 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
20501 Type *Tys[] = { Addr->getType() };
20502 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
20504 return Builder.CreateCall(
20505 Strex, {Builder.CreateZExtOrBitCast(
20506 Val, Strex->getFunctionType()->getParamType(0)),
20507 Addr});
20511 bool ARMTargetLowering::alignLoopsWithOptSize() const {
20512 return Subtarget->isMClass();
20515 /// A helper function for determining the number of interleaved accesses we
20516 /// will generate when lowering accesses of the given type.
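/// For example, a 512-bit vector type is lowered as (512 + 127) / 128 = 4
/// interleaved accesses.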
20517 unsigned
20518 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
20519 const DataLayout &DL) const {
20520 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
20523 bool ARMTargetLowering::isLegalInterleavedAccessType(
20524 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
20525 const DataLayout &DL) const {
20527 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
20528 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
20530 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
20531 return false;
20533 // Ensure the vector doesn't have f16 elements. Even though we could do an
20534 // i16 vldN, we can't hold the f16 vectors and will end up converting via
20535 // f32.
20536 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
20537 return false;
20538 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
20539 return false;
20541 // Ensure the number of vector elements is greater than 1.
20542 if (VecTy->getNumElements() < 2)
20543 return false;
20545 // Ensure the element type is legal.
20546 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
20547 return false;
20548 // And that the alignment is high enough under MVE.
20549 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
20550 return false;
20552 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
20553 // 128 will be split into multiple interleaved accesses.
20554 if (Subtarget->hasNEON() && VecSize == 64)
20555 return true;
20556 return VecSize % 128 == 0;
20559 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
20560 if (Subtarget->hasNEON())
20561 return 4;
20562 if (Subtarget->hasMVEIntegerOps())
20563 return MVEMaxSupportedInterleaveFactor;
20564 return TargetLoweringBase::getMaxSupportedInterleaveFactor();
20567 /// Lower an interleaved load into a vldN intrinsic.
20569 /// E.g. Lower an interleaved load (Factor = 2):
20570 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
20571 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
20572 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
20574 /// Into:
20575 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
20576 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
20577 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
20578 bool ARMTargetLowering::lowerInterleavedLoad(
20579 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
20580 ArrayRef<unsigned> Indices, unsigned Factor) const {
20581 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
20582 "Invalid interleave factor");
20583 assert(!Shuffles.empty() && "Empty shufflevector input");
20584 assert(Shuffles.size() == Indices.size() &&
20585 "Unmatched number of shufflevectors and indices");
20587 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
20588 Type *EltTy = VecTy->getElementType();
20590 const DataLayout &DL = LI->getModule()->getDataLayout();
20591 Align Alignment = LI->getAlign();
20593 // Skip if we do not have NEON and skip illegal vector types. We can
20594 // "legalize" wide vector types into multiple interleaved accesses as long as
20595 // the vector types are divisible by 128.
20596 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
20597 return false;
20599 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
20601 // A pointer vector cannot be the return type of the ldN intrinsics. We need
20602 // to load integer vectors first and then convert to pointer vectors.
20603 if (EltTy->isPointerTy())
20604 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
20606 IRBuilder<> Builder(LI);
20608 // The base address of the load.
20609 Value *BaseAddr = LI->getPointerOperand();
20611 if (NumLoads > 1) {
20612 // If we're going to generate more than one load, reset the sub-vector type
20613 // to something legal.
20614 VecTy = FixedVectorType::get(VecTy->getElementType(),
20615 VecTy->getNumElements() / NumLoads);
20617 // We will compute the pointer operand of each load from the original base
20618 // address using GEPs. Cast the base address to a pointer to the scalar
20619 // element type.
20620 BaseAddr = Builder.CreateBitCast(
20621 BaseAddr,
20622 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
20625 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
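
  // Emits a single vldN call at the given address. The NEON
  // llvm.arm.neon.vldN intrinsics take an i8* plus an explicit alignment
  // operand, while the MVE llvm.arm.mve.vldNq intrinsics take a pointer to
  // the element type and carry no alignment operand.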
  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy =
          VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
                                            VecTy->getNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec,
            FixedVectorType::get(SV->getType()->getElementType(), VecTy));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
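
  // Each of the Factor interleaved fields contributes LaneLen elements to the
  // wide shuffle result.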
  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  Align Alignment = SI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();
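
  // Emits the vstN call(s) for one legal-sized chunk. The NEON
  // llvm.arm.neon.vstN intrinsics take an i8*, the Factor sub-vectors and an
  // alignment operand in a single call; the MVE vst2q/vst4q intrinsics are
  // instead called once per interleaving stage.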
  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      append_range(Ops, Shuffles);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      append_range(Ops, Shuffles);
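      // The MVE vstNq intrinsics store one interleaving stage per call, so
      // emit Factor calls, each with the stage index as the trailing operand.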
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undef, StartMask stays 0 and we
        // default to taking elements from index 0. Filling undef gaps with
        // arbitrary elements is fine, since those lanes were being written
        // with undef anyway. StartMask cannot be negative here; that is
        // checked in isReInterleaveMask.
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

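/// Return true if \p Ty is an AAPCS-VFP homogeneous aggregate: a struct or
/// array built, possibly recursively, out of members of a single base type,
/// where the base type is float, double, or a 64-bit or 128-bit vector, and
/// the total member count is between 1 and 4.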
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(
    Type *ArgTy, const DataLayout &DL) const {
  const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

Register ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}

Register ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}

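// The two hooks below implement the "split CSR" scheme used by calling
// conventions such as CXX_FAST_TLS: the affected callee-saved registers are
// preserved by copying them through virtual registers in the entry block and
// restoring them before each return, rather than through the usual
// spill/restore code.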
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
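  // For each callee-saved register that is handled via copy, create a virtual
  // register of the matching class, copy the CSR into it at the top of the
  // entry block, and copy it back right before the terminator of every exit
  // block.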
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions; it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
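  // Compute the maximum call frame size up front (prolog/epilog insertion
  // would otherwise compute it later), then run the target-independent
  // finalization.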
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);