//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");
static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
                cl::desc("Enable / disable ARM interworking (for debugging only)"),
                cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
    cl::desc("Maximum interleave factor for MVE VLDn to generate."),
    cl::init(2));
// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
  setOperationAction(ISD::SDIVREM, VT, Expand);
  setOperationAction(ISD::UDIVREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}
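// A rough summary of the legalize actions used above: Legal operations are
// selected directly, Custom operations are rewritten by target-specific
// lowering code in this file, Promote switches to a different type (e.g. the
// D-register loads/stores above reuse the PromotedLdStVT form), and Expand
// falls back to generic expansion or a libcall.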
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64);
}
void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}
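// setAllExpand is used below for types that are only legal as bare register
// values (for example f32 without a VFP2 base, or bf16): every operation is
// first marked Expand, and then the handful of moves the register file can
// always do (bitcast/load/store/undef) are re-marked Legal.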
void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
    setOperationAction(ISD::ABDS, VT, Legal);
    setOperationAction(ISD::ABDU, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }
  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // Custom Expand smaller than legal vector reductions to prevent false zero
  // items being added.
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);

  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v4i8 to v4i16 or v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
    setOperationAction(ISD::TRUNCATE, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
}
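// For MVE, the 128-bit vector types above live in the MQPR (Q register)
// class and the v16i1/v8i1/v4i1 predicate types live in VCCR; most
// element-wise integer operations are Legal, while shuffles, element
// insert/extract and predicate handling go through Custom lowering.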
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }
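  // On these targets every runtime libcall above defaults to the AAPCS
  // calling convention, or AAPCS-VFP when compiling for a hard-float ABI
  // (e.g. -mfloat-abi=hard), so helper calls match the ABI that the rest of
  // the program is built with.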
  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
        { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
        { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
  setLibcallName(RTLIB::MUL_I128, nullptr);
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }
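  // For the comparison helpers above, the Cond field records how the integer
  // result of the libcall is converted back to a boolean: e.g. __aeabi_dcmpeq
  // returns nonzero when the operands compare equal, so OEQ_F64 is checked
  // with SETNE against zero, while UNE_F64 reuses the same helper with SETEQ.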
  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }
  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
      Subtarget->hasFPRegs()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
    if (!Subtarget->hasVFP2Base())
      setAllExpand(MVT::f32);
    if (!Subtarget->hasFP64())
      setAllExpand(MVT::f64);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  if (Subtarget->hasBF16()) {
    addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
    setAllExpand(MVT::bf16);
    if (!Subtarget->hasFullFP16())
      setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }

  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      addAllExtLoads(VT, InnerVT, Expand);
    }

    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasMVEIntegerOps())
    addMVEVectorTypes(Subtarget->hasMVEFloatOps());
  // Combine low-overhead loop intrinsics so that we can lower i1 types.
  if (Subtarget->hasLOB()) {
    setTargetDAGCombine(ISD::BRCOND);
    setTargetDAGCombine(ISD::BR_CC);
  }

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }

    if (Subtarget->hasBF16()) {
      addQRTypeForNEON(MVT::v8bf16);
      addDRTypeForNEON(MVT::v4bf16);
    }
  }
  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // none of Neon, MVE or VFP supports any arithmetic operations on it.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }
  if (Subtarget->hasNEON()) {
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, and nor does
    // it have a FP_TO_[SU]INT instruction with a narrower destination than
    // source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
    }

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }
  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::VECREDUCE_ADD);
    setTargetDAGCombine(ISD::ADD);
    setTargetDAGCombine(ISD::BITCAST);
  }
  if (Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::SMIN);
    setTargetDAGCombine(ISD::UMIN);
    setTargetDAGCombine(ISD::SMAX);
    setTargetDAGCombine(ISD::UMAX);
    setTargetDAGCombine(ISD::FP_EXTEND);
    setTargetDAGCombine(ISD::SELECT);
    setTargetDAGCombine(ISD::SELECT_CC);
  }
  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    }
  }

  if (!Subtarget->hasFP16()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }
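  // Pre-indexed forms update the base register before the access
  // (e.g. "ldr r0, [r1, #4]!") and post-indexed forms update it afterwards
  // (e.g. "ldr r0, [r1], #4"); Thumb-1 only gets the POST_INC i32 case,
  // which maps onto LDM/STM with base-register writeback.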
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  if (Subtarget->hasDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i16, Custom);
  }
  if (Subtarget->hasBaseDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  }

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);
  setOperationAction(ISD::STORE, MVT::i64, Custom);
  // MVE lowers 64 bit shifts to lsll and lsrl
  // assuming that ISD::SRL and SRA of i64 are already marked custom
  if (Subtarget->hasMVEIntegerOps())
    setOperationAction(ISD::SHL, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }
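  // __aeabi_idivmod and __aeabi_uidivmod return the quotient in r0 and the
  // remainder in r1, which is why SDIVREM/UDIVREM are custom-lowered to a
  // single helper call above instead of expanding into separate divide and
  // remainder libcalls.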
1265 if (Subtarget
->getTargetTriple().isOSMSVCRT()) {
1266 // MSVCRT doesn't have powi; fall back to pow
1267 setLibcallName(RTLIB::POWI_F32
, nullptr);
1268 setLibcallName(RTLIB::POWI_F64
, nullptr);
1271 setOperationAction(ISD::GlobalAddress
, MVT::i32
, Custom
);
1272 setOperationAction(ISD::ConstantPool
, MVT::i32
, Custom
);
1273 setOperationAction(ISD::GlobalTLSAddress
, MVT::i32
, Custom
);
1274 setOperationAction(ISD::BlockAddress
, MVT::i32
, Custom
);
1276 setOperationAction(ISD::TRAP
, MVT::Other
, Legal
);
1277 setOperationAction(ISD::DEBUGTRAP
, MVT::Other
, Legal
);
1279 // Use the default implementation.
1280 setOperationAction(ISD::VASTART
, MVT::Other
, Custom
);
1281 setOperationAction(ISD::VAARG
, MVT::Other
, Expand
);
1282 setOperationAction(ISD::VACOPY
, MVT::Other
, Expand
);
1283 setOperationAction(ISD::VAEND
, MVT::Other
, Expand
);
1284 setOperationAction(ISD::STACKSAVE
, MVT::Other
, Expand
);
1285 setOperationAction(ISD::STACKRESTORE
, MVT::Other
, Expand
);
1287 if (Subtarget
->isTargetWindows())
1288 setOperationAction(ISD::DYNAMIC_STACKALLOC
, MVT::i32
, Custom
);
1290 setOperationAction(ISD::DYNAMIC_STACKALLOC
, MVT::i32
, Expand
);
  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasAcquireRelease() ||
        getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If the target has DMB in Thumb mode, fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    if (!InsertFencesForAtomic) {
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    }
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
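  // Note: the custom PREFETCH lowering turns @llvm.prefetch into the target
  // PRELOAD node, which selects to pld/pldw/pli where the subtarget provides
  // those instructions.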
  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
    // iff target supports vfp2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
    setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SETCC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  }

  setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4Base()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }
  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
    if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    }

    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
    if (!Subtarget->hasFP16()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }

    // Strict floating-point comparisons need custom lowering.
    setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  }

  // Use __sincos_stret if available.
  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  }
  // FP-ARMv8 implements a lot of rounding-like FP operations.
  if (Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    if (Subtarget->hasNEON()) {
      setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
      setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
    }

    if (Subtarget->hasFP64()) {
      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    }
  }
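  // Note: marking these rounding-style operations Legal relies on the FP-ARMv8
  // VRINT family (vrintm/vrintp/vrinta/vrintz/vrintr/vrintx) and on
  // vminnm/vmaxnm for FMINNUM/FMAXNUM.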
  // FP16 often needs to be promoted to call lib functions
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FREM, MVT::f16, Promote);
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
    setOperationAction(ISD::FPOWI, MVT::f16, Promote);
    setOperationAction(ISD::FPOW, MVT::f16, Promote);
    setOperationAction(ISD::FEXP, MVT::f16, Promote);
    setOperationAction(ISD::FEXP2, MVT::f16, Promote);
    setOperationAction(ISD::FLOG, MVT::f16, Promote);
    setOperationAction(ISD::FLOG10, MVT::f16, Promote);
    setOperationAction(ISD::FLOG2, MVT::f16, Promote);

    setOperationAction(ISD::FROUND, MVT::f16, Legal);
  }
  if (Subtarget->hasNEON()) {
    // vmin and vmax aren't available in a scalar form, so we can use
    // a NEON instruction with an undef lane instead.  This has a performance
    // penalty on some cores, so we don't do this unless we have been
    // asked to by the core tuning model.
    if (Subtarget->useNEONForSinglePrecisionFP()) {
      setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
      setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
    }
    setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);

      setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
    }
  }
  // We have target-specific dag combine patterns for the following nodes:
  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);

  if (Subtarget->hasMVEIntegerOps())
    setTargetDAGCombine(ISD::VSELECT);

  if (Subtarget->hasV6Ops())
    setTargetDAGCombine(ISD::SRL);
  if (Subtarget->isThumb1Only())
    setTargetDAGCombine(ISD::SHL);

  setStackPointerRegisterToSaveRestore(ARM::SP);

  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
      !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Hybrid);
  //// temporary - rewrite interface to use type
  MaxStoresPerMemset = 8;
  MaxStoresPerMemsetOptSize = 4;
  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 2;
  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 2;

  // On ARM arguments smaller than 4 bytes are extended, so all arguments
  // are at least 4 bytes aligned.
  setMinStackArgumentAlignment(Align(4));

  // Prefer likely predicted branches to selects on out-of-order cores.
  PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();

  setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));

  setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));

  if (Subtarget->isThumb() || Subtarget->isThumb2())
    setTargetDAGCombine(ISD::ABS);
}

bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER:
    break;
    MAKE_CASE(ARMISD::Wrapper)
    MAKE_CASE(ARMISD::WrapperPIC)
    MAKE_CASE(ARMISD::WrapperJT)
    MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)
    MAKE_CASE(ARMISD::CALL)
    MAKE_CASE(ARMISD::CALL_PRED)
    MAKE_CASE(ARMISD::CALL_NOLINK)
    MAKE_CASE(ARMISD::tSECALL)
    MAKE_CASE(ARMISD::BRCOND)
    MAKE_CASE(ARMISD::BR_JT)
    MAKE_CASE(ARMISD::BR2_JT)
    MAKE_CASE(ARMISD::RET_FLAG)
    MAKE_CASE(ARMISD::SERET_FLAG)
    MAKE_CASE(ARMISD::INTRET_FLAG)
    MAKE_CASE(ARMISD::PIC_ADD)
    MAKE_CASE(ARMISD::CMP)
    MAKE_CASE(ARMISD::CMN)
    MAKE_CASE(ARMISD::CMPZ)
    MAKE_CASE(ARMISD::CMPFP)
    MAKE_CASE(ARMISD::CMPFPE)
    MAKE_CASE(ARMISD::CMPFPw0)
    MAKE_CASE(ARMISD::CMPFPEw0)
    MAKE_CASE(ARMISD::BCC_i64)
    MAKE_CASE(ARMISD::FMSTAT)
    MAKE_CASE(ARMISD::CMOV)
    MAKE_CASE(ARMISD::SUBS)
    MAKE_CASE(ARMISD::SSAT)
    MAKE_CASE(ARMISD::USAT)
    MAKE_CASE(ARMISD::ASRL)
    MAKE_CASE(ARMISD::LSRL)
    MAKE_CASE(ARMISD::LSLL)
    MAKE_CASE(ARMISD::SRL_FLAG)
    MAKE_CASE(ARMISD::SRA_FLAG)
    MAKE_CASE(ARMISD::RRX)
    MAKE_CASE(ARMISD::ADDC)
    MAKE_CASE(ARMISD::ADDE)
    MAKE_CASE(ARMISD::SUBC)
    MAKE_CASE(ARMISD::SUBE)
    MAKE_CASE(ARMISD::LSLS)
    MAKE_CASE(ARMISD::VMOVRRD)
    MAKE_CASE(ARMISD::VMOVDRR)
    MAKE_CASE(ARMISD::VMOVhr)
    MAKE_CASE(ARMISD::VMOVrh)
    MAKE_CASE(ARMISD::VMOVSR)
    MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)
    MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)
    MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)
    MAKE_CASE(ARMISD::TC_RETURN)
    MAKE_CASE(ARMISD::THREAD_POINTER)
    MAKE_CASE(ARMISD::DYN_ALLOC)
    MAKE_CASE(ARMISD::MEMBARRIER_MCR)
    MAKE_CASE(ARMISD::PRELOAD)
    MAKE_CASE(ARMISD::LDRD)
    MAKE_CASE(ARMISD::STRD)
    MAKE_CASE(ARMISD::WIN__CHKSTK)
    MAKE_CASE(ARMISD::WIN__DBZCHK)
    MAKE_CASE(ARMISD::PREDICATE_CAST)
    MAKE_CASE(ARMISD::VECTOR_REG_CAST)
    MAKE_CASE(ARMISD::MVESEXT)
    MAKE_CASE(ARMISD::MVEZEXT)
    MAKE_CASE(ARMISD::MVETRUNC)
    MAKE_CASE(ARMISD::VCMP)
    MAKE_CASE(ARMISD::VCMPZ)
    MAKE_CASE(ARMISD::VTST)
    MAKE_CASE(ARMISD::VSHLs)
    MAKE_CASE(ARMISD::VSHLu)
    MAKE_CASE(ARMISD::VSHLIMM)
    MAKE_CASE(ARMISD::VSHRsIMM)
    MAKE_CASE(ARMISD::VSHRuIMM)
    MAKE_CASE(ARMISD::VRSHRsIMM)
    MAKE_CASE(ARMISD::VRSHRuIMM)
    MAKE_CASE(ARMISD::VRSHRNIMM)
    MAKE_CASE(ARMISD::VQSHLsIMM)
    MAKE_CASE(ARMISD::VQSHLuIMM)
    MAKE_CASE(ARMISD::VQSHLsuIMM)
    MAKE_CASE(ARMISD::VQSHRNsIMM)
    MAKE_CASE(ARMISD::VQSHRNuIMM)
    MAKE_CASE(ARMISD::VQSHRNsuIMM)
    MAKE_CASE(ARMISD::VQRSHRNsIMM)
    MAKE_CASE(ARMISD::VQRSHRNuIMM)
    MAKE_CASE(ARMISD::VQRSHRNsuIMM)
    MAKE_CASE(ARMISD::VSLIIMM)
    MAKE_CASE(ARMISD::VSRIIMM)
    MAKE_CASE(ARMISD::VGETLANEu)
    MAKE_CASE(ARMISD::VGETLANEs)
    MAKE_CASE(ARMISD::VMOVIMM)
    MAKE_CASE(ARMISD::VMVNIMM)
    MAKE_CASE(ARMISD::VMOVFPIMM)
    MAKE_CASE(ARMISD::VDUP)
    MAKE_CASE(ARMISD::VDUPLANE)
    MAKE_CASE(ARMISD::VEXT)
    MAKE_CASE(ARMISD::VREV64)
    MAKE_CASE(ARMISD::VREV32)
    MAKE_CASE(ARMISD::VREV16)
    MAKE_CASE(ARMISD::VZIP)
    MAKE_CASE(ARMISD::VUZP)
    MAKE_CASE(ARMISD::VTRN)
    MAKE_CASE(ARMISD::VTBL1)
    MAKE_CASE(ARMISD::VTBL2)
    MAKE_CASE(ARMISD::VMOVN)
    MAKE_CASE(ARMISD::VQMOVNs)
    MAKE_CASE(ARMISD::VQMOVNu)
    MAKE_CASE(ARMISD::VCVTN)
    MAKE_CASE(ARMISD::VCVTL)
    MAKE_CASE(ARMISD::VIDUP)
    MAKE_CASE(ARMISD::VMULLs)
    MAKE_CASE(ARMISD::VMULLu)
    MAKE_CASE(ARMISD::VQDMULH)
    MAKE_CASE(ARMISD::VADDVs)
    MAKE_CASE(ARMISD::VADDVu)
    MAKE_CASE(ARMISD::VADDVps)
    MAKE_CASE(ARMISD::VADDVpu)
    MAKE_CASE(ARMISD::VADDLVs)
    MAKE_CASE(ARMISD::VADDLVu)
    MAKE_CASE(ARMISD::VADDLVAs)
    MAKE_CASE(ARMISD::VADDLVAu)
    MAKE_CASE(ARMISD::VADDLVps)
    MAKE_CASE(ARMISD::VADDLVpu)
    MAKE_CASE(ARMISD::VADDLVAps)
    MAKE_CASE(ARMISD::VADDLVApu)
    MAKE_CASE(ARMISD::VMLAVs)
    MAKE_CASE(ARMISD::VMLAVu)
    MAKE_CASE(ARMISD::VMLAVps)
    MAKE_CASE(ARMISD::VMLAVpu)
    MAKE_CASE(ARMISD::VMLALVs)
    MAKE_CASE(ARMISD::VMLALVu)
    MAKE_CASE(ARMISD::VMLALVps)
    MAKE_CASE(ARMISD::VMLALVpu)
    MAKE_CASE(ARMISD::VMLALVAs)
    MAKE_CASE(ARMISD::VMLALVAu)
    MAKE_CASE(ARMISD::VMLALVAps)
    MAKE_CASE(ARMISD::VMLALVApu)
    MAKE_CASE(ARMISD::VMINVu)
    MAKE_CASE(ARMISD::VMINVs)
    MAKE_CASE(ARMISD::VMAXVu)
    MAKE_CASE(ARMISD::VMAXVs)
    MAKE_CASE(ARMISD::UMAAL)
    MAKE_CASE(ARMISD::UMLAL)
    MAKE_CASE(ARMISD::SMLAL)
    MAKE_CASE(ARMISD::SMLALBB)
    MAKE_CASE(ARMISD::SMLALBT)
    MAKE_CASE(ARMISD::SMLALTB)
    MAKE_CASE(ARMISD::SMLALTT)
    MAKE_CASE(ARMISD::SMULWB)
    MAKE_CASE(ARMISD::SMULWT)
    MAKE_CASE(ARMISD::SMLALD)
    MAKE_CASE(ARMISD::SMLALDX)
    MAKE_CASE(ARMISD::SMLSLD)
    MAKE_CASE(ARMISD::SMLSLDX)
    MAKE_CASE(ARMISD::SMMLAR)
    MAKE_CASE(ARMISD::SMMLSR)
    MAKE_CASE(ARMISD::QADD16b)
    MAKE_CASE(ARMISD::QSUB16b)
    MAKE_CASE(ARMISD::QADD8b)
    MAKE_CASE(ARMISD::QSUB8b)
    MAKE_CASE(ARMISD::UQADD16b)
    MAKE_CASE(ARMISD::UQSUB16b)
    MAKE_CASE(ARMISD::UQADD8b)
    MAKE_CASE(ARMISD::UQSUB8b)
    MAKE_CASE(ARMISD::BUILD_VECTOR)
    MAKE_CASE(ARMISD::BFI)
    MAKE_CASE(ARMISD::VORRIMM)
    MAKE_CASE(ARMISD::VBICIMM)
    MAKE_CASE(ARMISD::VBSP)
    MAKE_CASE(ARMISD::MEMCPY)
    MAKE_CASE(ARMISD::VLD1DUP)
    MAKE_CASE(ARMISD::VLD2DUP)
    MAKE_CASE(ARMISD::VLD3DUP)
    MAKE_CASE(ARMISD::VLD4DUP)
    MAKE_CASE(ARMISD::VLD1_UPD)
    MAKE_CASE(ARMISD::VLD2_UPD)
    MAKE_CASE(ARMISD::VLD3_UPD)
    MAKE_CASE(ARMISD::VLD4_UPD)
    MAKE_CASE(ARMISD::VLD1x2_UPD)
    MAKE_CASE(ARMISD::VLD1x3_UPD)
    MAKE_CASE(ARMISD::VLD1x4_UPD)
    MAKE_CASE(ARMISD::VLD2LN_UPD)
    MAKE_CASE(ARMISD::VLD3LN_UPD)
    MAKE_CASE(ARMISD::VLD4LN_UPD)
    MAKE_CASE(ARMISD::VLD1DUP_UPD)
    MAKE_CASE(ARMISD::VLD2DUP_UPD)
    MAKE_CASE(ARMISD::VLD3DUP_UPD)
    MAKE_CASE(ARMISD::VLD4DUP_UPD)
    MAKE_CASE(ARMISD::VST1_UPD)
    MAKE_CASE(ARMISD::VST2_UPD)
    MAKE_CASE(ARMISD::VST3_UPD)
    MAKE_CASE(ARMISD::VST4_UPD)
    MAKE_CASE(ARMISD::VST1x2_UPD)
    MAKE_CASE(ARMISD::VST1x3_UPD)
    MAKE_CASE(ARMISD::VST1x4_UPD)
    MAKE_CASE(ARMISD::VST2LN_UPD)
    MAKE_CASE(ARMISD::VST3LN_UPD)
    MAKE_CASE(ARMISD::VST4LN_UPD)
    MAKE_CASE(ARMISD::WLS)
    MAKE_CASE(ARMISD::WLSSETUP)
    MAKE_CASE(ARMISD::LE)
    MAKE_CASE(ARMISD::LOOP_DEC)
    MAKE_CASE(ARMISD::CSINV)
    MAKE_CASE(ARMISD::CSNEG)
    MAKE_CASE(ARMISD::CSINC)
    MAKE_CASE(ARMISD::MEMCPYLOOP)
    MAKE_CASE(ARMISD::MEMSETLOOP)
#undef MAKE_CASE
  }
  return nullptr;
}
EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (!VT.isVector())
    return getPointerTy(DL);

  // MVE has a predicate register.
  if ((Subtarget->hasMVEIntegerOps() &&
       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) ||
      (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16)))
    return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
  return VT.changeVectorElementTypeToInteger();
}
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *
ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
  // MVE Q registers.
  if (Subtarget->hasNEON()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  if (Subtarget->hasMVEIntegerOps()) {
    if (VT == MVT::v4i64)
      return &ARM::MQQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::MQQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}
// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}
// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return ARM::createFastISel(funcInfo, libInfo);
}
Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
    return Sched::ILP;

  return Sched::RegPressure;
}
//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

static bool isSRL16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SRL)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

static bool isSRA16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SRA)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

static bool isSHL16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SHL)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

// Check for a signed 16-bit value. We special case SRA because it makes it
// simpler when also looking for SRAs that aren't sign extending a
// smaller value. Without the check, we'd need to take extra care with
// checking order for some operations.
static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
  if (isSRA16(Op))
    return isSHL16(Op.getOperand(0));
  return DAG.ComputeNumSignBits(Op) == 17;
}
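// Note: for a 32-bit value, ComputeNumSignBits(Op) == 17 means the top 17 bits
// are all copies of the sign bit, i.e. the value already fits in 16 signed
// bits. The SRA special case catches the common (sra (shl X, 16), 16) pattern
// explicitly.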
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}
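// Note: the unsigned comparisons map onto the carry-flag conditions, e.g.
// SETULT becomes LO (unsigned lower) and SETUGE becomes HS (unsigned higher or
// same), matching what a CMP sets in APSR.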
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}
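// Note: the two-condition entries cover predicates that need a pair of ARM
// conditions after an FP compare. For example SETONE (ordered, not equal) is
// tested as MI ("less than") or else GT ("greater than"); both are false for
// unordered operands, which is exactly what "ordered and not equal" requires.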
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
CallingConv::ID
ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                           bool isVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_AAPCS:
  case CallingConv::ARM_APCS:
  case CallingConv::GHC:
  case CallingConv::CFGuard_Check:
    return CC;
  case CallingConv::PreserveMost:
    return CallingConv::PreserveMost;
  case CallingConv::ARM_AAPCS_VFP:
  case CallingConv::Swift:
  case CallingConv::SwiftTail:
    return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
  case CallingConv::C:
  case CallingConv::Tail:
    if (!Subtarget->isAAPCS_ABI())
      return CallingConv::ARM_APCS;
    else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  case CallingConv::Fast:
  case CallingConv::CXX_FAST_TLS:
    if (!Subtarget->isAAPCS_ABI()) {
      if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
        return CallingConv::Fast;
      return CallingConv::ARM_APCS;
    } else if (Subtarget->hasVFP2Base() &&
               !Subtarget->isThumb1Only() && !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  }
}
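// Note: as an example of the mapping above, a plain C call on an AAPCS target
// built with a hard-float ABI is treated as ARM_AAPCS_VFP (FP arguments in VFP
// registers), but the same call becomes ARM_AAPCS as soon as it is variadic,
// since the VFP variant does not apply to variadic functions.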
CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                 bool isVarArg) const {
  return CCAssignFnForNode(CC, false, isVarArg);
}

CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                   bool isVarArg) const {
  return CCAssignFnForNode(CC, true, isVarArg);
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_APCS:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  case CallingConv::ARM_AAPCS:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::ARM_AAPCS_VFP:
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  case CallingConv::PreserveMost:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::CFGuard_Check:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  }
}
SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
                                     MVT LocVT, MVT ValVT, SDValue Val) const {
  Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
                    Val);
  if (Subtarget->hasFullFP16()) {
    Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
  } else {
    Val = DAG.getNode(ISD::TRUNCATE, dl,
                      MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
    Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
  }
  return Val;
}

SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
                                       MVT LocVT, MVT ValVT,
                                       SDValue Val) const {
  if (Subtarget->hasFullFP16()) {
    Val = DAG.getNode(ARMISD::VMOVrh, dl,
                      MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  } else {
    Val = DAG.getNode(ISD::BITCAST, dl,
                      MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
    Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
                      MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  }
  return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
}
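// Note: these helpers implement the "f16 travels in the low 16 bits of a
// 32-bit location" convention described in the call/return lowering below.
// With +fullfp16 the VMOVhr/VMOVrh nodes move directly between a GPR and an
// f16 register; otherwise the value is shuffled through an integer
// truncate/extend plus bitcasts.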
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom() &&
        (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
      // Handle f64 or half of a v2f64.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      if (!Subtarget->isLittle())
        std::swap(Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap(Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    // f16 arguments have their size extended to 4 bytes and passed as if they
    // had been copied to the LSBs of a 32-bit register.
    // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
    if (VA.needsCustom() &&
        (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
      Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}
std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
    const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
    bool IsTailCall, int SPDiff) const {
  SDValue DstAddr;
  MachinePointerInfo DstInfo;
  int32_t Offset = VA.getLocMemOffset();
  MachineFunction &MF = DAG.getMachineFunction();

  if (IsTailCall) {
    Offset += SPDiff;
    auto PtrVT = getPointerTy(DAG.getDataLayout());
    int Size = VA.getLocVT().getFixedSizeInBits() / 8;
    int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
    DstAddr = DAG.getFrameIndex(FI, PtrVT);
    DstInfo =
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
  } else {
    SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
    DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                          StackPtr, PtrOff);
    DstInfo =
        MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
  }

  return std::make_pair(DstAddr, DstInfo);
}
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                         SDValue Chain, SDValue &Arg,
                                         RegsToPassVector &RegsToPass,
                                         CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &StackPtr,
                                         SmallVectorImpl<SDValue> &MemOpChains,
                                         bool IsTailCall, int SPDiff) const {
  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
  unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                    getPointerTy(DAG.getDataLayout()));

    SDValue DstAddr;
    MachinePointerInfo DstInfo;
    std::tie(DstAddr, DstInfo) =
        computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
  }
}
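// Note: the VMOVRRD above splits the f64 into two i32 results; 'id' selects
// which half is placed in the first register so that the register pair matches
// the target's endianness.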
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}
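// Note: tail-call guarantees therefore apply to fastcc only under
// GuaranteedTailCallOpt (-tailcallopt), while tailcc and swifttailcc always
// guarantee them.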
/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
SDValue
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG                     = CLI.DAG;
  SDLoc &dl                             = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
  SDValue Chain                         = CLI.Chain;
  SDValue Callee                        = CLI.Callee;
  bool &isTailCall                      = CLI.IsTailCall;
  CallingConv::ID CallConv              = CLI.CallConv;
  bool doesNotRet                       = CLI.DoesNotReturn;
  bool isVarArg                         = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  MachineFunction::CallSiteInfo CSInfo;
  bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  bool isThisReturn = false;
  bool isCmseNSCall = false;
  bool isSibCall = false;
  bool PreferIndirect = false;

  // Determine whether this is a non-secure function call.
  if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
    isCmseNSCall = true;

  // Disable tail calls if they're not supported.
  if (!Subtarget->supportsTailCall())
    isTailCall = false;

  // For both the non-secure calls and the returns from a CMSE entry function,
  // the function needs to do some extra work after the call, or before the
  // return, respectively, thus it cannot end with a tail call
  if (isCmseNSCall || AFI->isCmseNSEntryFunction())
    isTailCall = false;

  if (isa<GlobalAddressSDNode>(Callee)) {
    // If we're optimizing for minimum size and the function is called three or
    // more times in this block, we can improve codesize by calling indirectly
    // as BLXr has a 16-bit encoding.
    auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
    if (CLI.CB) {
      auto *BB = CLI.CB->getParent();
      PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
                       count_if(GV->users(), [&BB](const User *U) {
                         return isa<Instruction>(U) &&
                                cast<Instruction>(U)->getParent() == BB;
                       }) > 2;
    }
  }
  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(
        Callee, CallConv, isVarArg, isStructRet,
        MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
        PreferIndirect);

    if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
        CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
      isSibCall = true;

    // We don't support GuaranteedTailCallOpt for ARM, only automatically
    // detected sibcalls.
    if (isTailCall)
      ++NumTailCalls;
  }

  if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
    report_fatal_error("failed to perform tail call elimination on a call "
                       "site marked musttail");
  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // SPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int SPDiff = 0;

  if (isTailCall && !isSibCall) {
    auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
    unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();

    // Since callee will pop argument stack as a tail call, we must keep the
    // popped size 16-byte aligned.
    Align StackAlign = DAG.getDataLayout().getStackAlignment();
    NumBytes = alignTo(NumBytes, StackAlign);

    // SPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // can actually shrink the stack.
    SPDiff = NumReusableBytes - NumBytes;

    // If this call requires more stack than we have available from
    // LowerFormalArguments, tell FrameLowering to reserve space for it.
    if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
      AFI->setArgRegsSaveSize(-SPDiff);
  }

  if (isSibCall) {
    // For sibling tail calls, memory operands are available in our caller's stack.
    NumBytes = 0;
  } else {
    // Adjust the stack pointer for the new arguments...
    // These operations are automatically eliminated by the prolog/epilog pass
    Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
  }

  SDValue StackPtr =
      DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));

  RegsToPassVector RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // During a tail call, stores to the argument area must happen after all of
  // the function's incoming arguments have been loaded because they may alias.
  // This is done by folding in a TokenFactor from LowerFormalArguments, but
  // there's no point in doing so repeatedly so this tracks whether that's
  // happened yet.
  bool AfterFormalArgLoads = false;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
       i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
      Chain = DAG.getStackArgumentTokenFactor(Chain);
      AfterFormalArgLoads = true;
    }
    // f16 arguments have their size extended to 4 bytes and passed as if they
    // had been copied to the LSBs of a 32-bit register.
    // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
    if (VA.needsCustom() &&
        (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
      Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
    } else {
      // f16 arguments could have been extended prior to argument lowering.
      // Mask such arguments if this is a CMSE nonsecure call.
      auto ArgVT = Outs[realArgIdx].ArgVT;
      if (isCmseNSCall && (ArgVT == MVT::f16)) {
        auto LocBits = VA.getLocVT().getSizeInBits();
        auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
        SDValue Mask =
            DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
        Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      }
    }

    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
    if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
      SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                DAG.getConstant(0, dl, MVT::i32));
      SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                DAG.getConstant(1, dl, MVT::i32));

      PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
                       StackPtr, MemOpChains, isTailCall, SPDiff);

      VA = ArgLocs[++i]; // skip ahead to next loc
      if (VA.isRegLoc()) {
        PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
                         StackPtr, MemOpChains, isTailCall, SPDiff);
      } else {
        assert(VA.isMemLoc());
        SDValue DstAddr;
        MachinePointerInfo DstInfo;
        std::tie(DstAddr, DstInfo) =
            computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
        MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
      }
    } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
      PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
                       StackPtr, MemOpChains, isTailCall, SPDiff);
    } else if (VA.isRegLoc()) {
      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
          Outs[0].VT == MVT::i32) {
        assert(VA.getLocVT() == MVT::i32 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
               "unexpected use of 'returned'");
        isThisReturn = true;
      }
      const TargetOptions &Options = DAG.getTarget().Options;
      if (Options.EmitCallSiteInfo)
        CSInfo.emplace_back(VA.getLocReg(), i);
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (isByVal) {
      assert(VA.isMemLoc());
      unsigned offset = 0;

      // True if this byval aggregate will be split between registers
      // and memory.
      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();

      if (CurByValIdx < ByValArgsCount) {
        unsigned RegBegin, RegEnd;
        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);

        EVT PtrVT =
            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
        unsigned int i, j;
        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
                          DAG.InferPtrAlign(AddArg));
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(j, Load));
        }

        // If the parameter size extends past the register area, the "offset"
        // value helps us calculate the stack slot for the remaining part.
        offset = RegEnd - RegBegin;

        CCInfo.nextInRegsParam();
      }

      if (Flags.getByValSize() > 4*offset) {
        auto PtrVT = getPointerTy(DAG.getDataLayout());
        SDValue Dst;
        MachinePointerInfo DstInfo;
        std::tie(Dst, DstInfo) =
            computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
        SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
                                           MVT::i32);
        SDValue AlignNode =
            DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);

        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode };
        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                          Ops));
      }
    } else {
      assert(VA.isMemLoc());
      SDValue DstAddr;
      MachinePointerInfo DstInfo;
      std::tie(DstAddr, DstInfo) =
          computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);

      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
      MemOpChains.push_back(Store);
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  bool isDirect = false;

  const TargetMachine &TM = getTargetMachine();
  const Module *Mod = MF.getFunction().getParent();
  const GlobalValue *GV = nullptr;
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    GV = G->getGlobal();
  bool isStub =
      !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();

  bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
  bool isLocalARMFunc = false;
  auto PtrVt = getPointerTy(DAG.getDataLayout());

  if (Subtarget->genLongCalls()) {
    assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
           "long-calls codegen is not position independent!");
    // Handle a global address or an external symbol. If it's not one of
    // those, the target's already in a register, so we don't need to do
    // anything extra.
    if (isa<GlobalAddressSDNode>(Callee)) {
      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);

      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      const char *Sym = S->getSymbol();

      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                        ARMPCLabelIndex, 0);
      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }
  } else if (isa<GlobalAddressSDNode>(Callee)) {
    if (!PreferIndirect) {
      isDirect = true;
      bool isDef = GV->isStrongDefinitionForLinker();

      // ARM call to a local ARM function is predicable.
      isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
      // tBX takes a register source operand.
      if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
        assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
        Callee = DAG.getNode(
            ARMISD::WrapperPIC, dl, PtrVt,
            DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
        Callee = DAG.getLoad(
            PtrVt, dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
            MachineMemOperand::MODereferenceable |
                MachineMemOperand::MOInvariant);
      } else if (Subtarget->isTargetCOFF()) {
        assert(Subtarget->isTargetWindows() &&
               "Windows is the only supported COFF target");
        unsigned TargetFlags = ARMII::MO_NO_FLAG;
        if (GV->hasDLLImportStorageClass())
          TargetFlags = ARMII::MO_DLLIMPORT;
        else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
          TargetFlags = ARMII::MO_COFFSTUB;
        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
                                            TargetFlags);
        if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
          Callee =
              DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
                          DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
      } else {
        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
      }
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    isDirect = true;
    // tBX takes a register source operand.
    const char *Sym = S->getSymbol();
    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                        ARMPCLabelIndex, 4);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
    } else {
      Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
    }
  }

  if (isCmseNSCall) {
    assert(!isARMFunc && !isDirect &&
           "Cannot handle call to ARM function or direct call");
    if (NumBytes > 0) {
      DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
                                     "call to non-secure function would "
                                     "require passing arguments on stack",
                                     dl.getDebugLoc());
      DAG.getContext()->diagnose(Diag);
    }
    if (isStructRet) {
      DiagnosticInfoUnsupported Diag(
          DAG.getMachineFunction().getFunction(),
          "call to non-secure function would return value through pointer",
          dl.getDebugLoc());
      DAG.getContext()->diagnose(Diag);
    }
  }
2698 // FIXME: handle tail calls differently.
2700 if (Subtarget
->isThumb()) {
2702 CallOpc
= ARMISD::tSECALL
;
2703 else if ((!isDirect
|| isARMFunc
) && !Subtarget
->hasV5TOps())
2704 CallOpc
= ARMISD::CALL_NOLINK
;
2706 CallOpc
= ARMISD::CALL
;
2708 if (!isDirect
&& !Subtarget
->hasV5TOps())
2709 CallOpc
= ARMISD::CALL_NOLINK
;
2710 else if (doesNotRet
&& isDirect
&& Subtarget
->hasRetAddrStack() &&
2711 // Emit regular call when code size is the priority
2712 !Subtarget
->hasMinSize())
2713 // "mov lr, pc; b _foo" to avoid confusing the RSP
2714 CallOpc
= ARMISD::CALL_NOLINK
;
2716 CallOpc
= isLocalARMFunc
? ARMISD::CALL_PRED
: ARMISD::CALL
;
2719 // We don't usually want to end the call-sequence here because we would tidy
2720 // the frame up *after* the call, however in the ABI-changing tail-call case
2721 // we've carefully laid out the parameters so that when sp is reset they'll be
2722 // in the correct location.
2723 if (isTailCall
&& !isSibCall
) {
2724 Chain
= DAG
.getCALLSEQ_END(Chain
, DAG
.getIntPtrConstant(0, dl
, true),
2725 DAG
.getIntPtrConstant(0, dl
, true), InFlag
, dl
);
2726 InFlag
= Chain
.getValue(1);
2729 std::vector
<SDValue
> Ops
;
2730 Ops
.push_back(Chain
);
2731 Ops
.push_back(Callee
);
2734 Ops
.push_back(DAG
.getTargetConstant(SPDiff
, dl
, MVT::i32
));
2737 // Add argument registers to the end of the list so that they are known live
2739 for (unsigned i
= 0, e
= RegsToPass
.size(); i
!= e
; ++i
)
2740 Ops
.push_back(DAG
.getRegister(RegsToPass
[i
].first
,
2741 RegsToPass
[i
].second
.getValueType()));
2743 // Add a register mask operand representing the call-preserved registers.
2745 const uint32_t *Mask
;
2746 const ARMBaseRegisterInfo
*ARI
= Subtarget
->getRegisterInfo();
2748 // For 'this' returns, use the R0-preserving mask if applicable
2749 Mask
= ARI
->getThisReturnPreservedMask(MF
, CallConv
);
2751 // Set isThisReturn to false if the calling convention is not one that
2752 // allows 'returned' to be modeled in this way, so LowerCallResult does
2753 // not try to pass 'this' straight through
2754 isThisReturn
= false;
2755 Mask
= ARI
->getCallPreservedMask(MF
, CallConv
);
2758 Mask
= ARI
->getCallPreservedMask(MF
, CallConv
);
2760 assert(Mask
&& "Missing call preserved mask for calling convention");
2761 Ops
.push_back(DAG
.getRegisterMask(Mask
));
2764 if (InFlag
.getNode())
2765 Ops
.push_back(InFlag
);
2767 SDVTList NodeTys
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
2769 MF
.getFrameInfo().setHasTailCall();
2770 SDValue Ret
= DAG
.getNode(ARMISD::TC_RETURN
, dl
, NodeTys
, Ops
);
2771 DAG
.addCallSiteInfo(Ret
.getNode(), std::move(CSInfo
));
2775 // Returns a chain and a flag for retval copy to use.
2776 Chain
= DAG
.getNode(CallOpc
, dl
, NodeTys
, Ops
);
2777 DAG
.addNoMergeSiteInfo(Chain
.getNode(), CLI
.NoMerge
);
2778 InFlag
= Chain
.getValue(1);
2779 DAG
.addCallSiteInfo(Chain
.getNode(), std::move(CSInfo
));
2781 // If we're guaranteeing tail-calls will be honoured, the callee must
2782 // pop its own argument stack on return. But this call is *not* a tail call so
2783 // we need to undo that after it returns to restore the status-quo.
2784 bool TailCallOpt
= getTargetMachine().Options
.GuaranteedTailCallOpt
;
2785 uint64_t CalleePopBytes
=
2786 canGuaranteeTCO(CallConv
, TailCallOpt
) ? alignTo(NumBytes
, 16) : -1ULL;
2788 Chain
= DAG
.getCALLSEQ_END(Chain
, DAG
.getIntPtrConstant(NumBytes
, dl
, true),
2789 DAG
.getIntPtrConstant(CalleePopBytes
, dl
, true),
2792 InFlag
= Chain
.getValue(1);
2794 // Handle result values, copying them out of physregs into vregs that we
2796 return LowerCallResult(Chain
, InFlag
, CallConv
, isVarArg
, Ins
, dl
, DAG
,
2797 InVals
, isThisReturn
,
2798 isThisReturn
? OutVals
[0] : SDValue());
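// Informal summary (descriptive comment, not exhaustive): the opcode chosen
// above maps roughly onto the emitted call form -- ARMISD::CALL becomes "bl",
// CALL_NOLINK covers indirect calls on pre-v5T cores without BLX (e.g.
// "mov lr, pc; b/bx"), tSECALL is the CMSE non-secure call pseudo, and
// TC_RETURN is used when the call is folded into the return sequence.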
/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack.  Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    Align Alignment) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Alignment = std::max(Alignment, Align(4));

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Alignment.value() / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and parameter size greater than size of
  // all remained GPR regs. In that case we can't split parameter, we must
  // send it to stack. We also must set NCRN to R4, so waste all
  // remained registers.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // First register for byval parameter is the first register that wasn't
  // allocated before this method call, so it would be "reg".
  // If parameter is small enough to be saved in range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs,
  // else parameter would be split between registers and stack,
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, first register is allocated in the beginning of function already,
  // allocate remained amount of registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}
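// Worked example (hypothetical values, for illustration only): a 20-byte byval
// argument with 8-byte alignment when r1 is the next free GPR. The alignment
// waste consumes r1, the argument then starts at r2 (ByValRegBegin = r2,
// ByValRegEnd = r4), Excess = 8, and Size is reduced to the 12 bytes that
// still have to live on the stack.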
/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!Register::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != std::numeric_limits<int>::max());
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
    const bool isIndirect) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Indirect tail calls cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
    return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt"))
    return false;

  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
    return CalleeCC == CallerCC;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
  if (!CCState::resultsCompatible(
          getEffectiveCallingConv(CalleeCC, isVarArg),
          getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
          CCAssignFnForReturn(CalleeCC, isVarArg),
          CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations.  The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  return true;
}
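// Informal summary (descriptive comment): a sibcall is rejected here whenever
// the caller is an interrupt handler, either side uses sret, a weak external
// callee could be preempted, result/callee-saved conventions differ, a split
// vararg/byval lives in the caller's frame, or an outgoing stack argument is
// not already sitting in the matching fixed stack slot.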
bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
}

static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
                                    const SDLoc &DL, SelectionDAG &DAG) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
  //    IRQ/FIQ: +4     "subs pc, lr, #4"
  //    SWI:     0      "subs pc, lr, #0"
  //    ABORT:   +4     "subs pc, lr, #4"
  //    UNDEF:   +4/+2  "subs pc, lr, #0"
  // UNDEF varies depending on where the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.

  int64_t LROffset;
  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
      IntKind == "ABORT")
    LROffset = 4;
  else if (IntKind == "SWI" || IntKind == "UNDEF")
    LROffset = 0;
  else
    report_fatal_error("Unsupported interrupt attribute. If present, value "
                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

  RetOps.insert(RetOps.begin() + 1,
                DAG.getConstant(LROffset, DL, MVT::i32, false));

  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
}
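// For example (illustrative): an IRQ or FIQ handler built through this path
// returns with "subs pc, lr, #4", while a SWI handler returns with
// "subs pc, lr, #0"; the offset is carried as the extra operand inserted into
// RetOps above.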
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  AFI->setReturnRegsCount(RVLocs.size());

  // Report error if cmse entry function returns structure through first ptr arg.
  if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
    // Note: using an empty SDLoc(), as the first line of the function is a
    // better place to report than the last line.
    DiagnosticInfoUnsupported Diag(
        DAG.getMachineFunction().getFunction(),
        "secure entry function would return value through pointer",
        SDLoc().getDebugLoc());
    DAG.getContext()->diagnose(Diag);
  }

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      //   t11 f16 = fadd ...
      //   t12: i16 = bitcast t11
      //   t13: i32 = zero_extend t12
      //   t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // Mask f16 arguments if this is a CMSE nonsecure entry.
    auto RetVT = Outs[realRVLocIdx].ArgVT;
    if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
      if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
        Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
      } else {
        auto LocBits = VA.getLocVT().getSizeInBits();
        auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
        SDValue Mask =
            DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
        Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      }
    }

    if (VA.needsCustom() &&
        (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain =
            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                             HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain =
            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                             HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(
        VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
  }
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
                                                            ARMISD::RET_FLAG;
  return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
}
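// Descriptive note: for a CMSE non-secure entry function returning f16, the
// AND with APInt::getLowBitsSet(LocBits, 16) above zeroes the bits of the
// location register above the f16 payload, so no secure-state bits leak
// through the top half of the returned register.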
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs.
    SmallPtrSet<SDNode*, 2> Copies;
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!Subtarget->supportsTailCall())
    return false;

  if (!CI->isTailCall())
    return false;

  return true;
}
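// Descriptive note: isUsedByReturnOnly looks through at most one
// CopyToReg / VMOVRRD / BITCAST layer and accepts only RET_FLAG or
// INTRET_FLAG users; this is the hook that lets generic DAG code fold a
// value-producing call directly into the return when nothing else uses it.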
// Trying to write a 64 bit value so need to split into two 32 bit values first,
// and pass the lower and high parts through.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(1, DL, MVT::i32));
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correct with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
                                 getFunction().getParent());
    auto GV = new GlobalVariable(
                    *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
                    Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
                    Twine(AFI->createPICLabelUId())
                  );
    SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
                                            dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  if (CP->isMachineConstantPoolEntry())
    Res =
        DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}

unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}
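// Descriptive note: EK_Inline means ARM jump tables are emitted in-line in
// the text section next to the dispatching branch rather than as a separate
// data table; the constant-island placement pass then lays them out within
// branch range.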
SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = 0;
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  SDValue CPAddr;
  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  if (!IsPositionIndependent) {
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
  } else {
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                      ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  if (!IsPositionIndependent)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}
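// Descriptive note: in PIC/ROPI mode the block address therefore becomes a
// pc-relative constant-pool load followed by ARMISD::PIC_ADD, roughly
//   ldr rN, .LCPI..._N
//   add rN, pc
// with the PIC label index tying the constant-pool entry to the add.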
/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
///
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is where
  // the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block)
  SDValue Ops[] = {Chain,
                   DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getTargetConstant(15, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(13, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());

  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain,
      DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                  DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}
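// Illustrative summary of the lowering above (comment only):
//   TEB      = mrc p15, #0, rT, c13, c0, #2
//   TLSArray = load [TEB + 0x2c]
//   TLS      = load [TLSArray + _tls_index * 4]
//   result   = TLS + SECREL(var)
// which is the usual Windows-on-ARM TLS access pattern.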
// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), Argument,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}
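// Descriptive note: the general-dynamic path therefore reduces to a plain C
// call to __tls_get_addr on the TLSGD descriptor, whose address is formed
// pc-relatively via the constant pool and ARMISD::PIC_ADD above.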
// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
                                      true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  } else {
    // local exec model
    assert(model == TLSModel::LocalExec);
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (Subtarget->isTargetWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
  switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic:
      return LowerToTLSGeneralDynamicModel(GA, DAG);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}
/// Return true if all users of V are within function F, looking through
/// ConstantExprs.
static bool allUsersAreInFunction(const Value *V, const Function *F) {
  SmallVector<const User*,4> Worklist(V->users());
  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    if (isa<ConstantExpr>(U)) {
      append_range(Worklist, U->users());
      continue;
    }

    auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getParent()->getParent() != F)
      return false;
  }
  return true;
}

static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant pool
  // to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsDynamicRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
      RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size (Sz
  // > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general. The LLVM IR unnamed_addr allows merging
  // constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    SmallVector<uint8_t,16> V(S.size());
    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
    Init = ConstantDataArray::get(*DAG.getContext(), V);
  }

  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
    AFI->markGlobalAsPromotedToConstantPool(GVar);
    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}
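// Illustrative example (hypothetical global, comment only): a 6-byte internal
// unnamed_addr string constant is padded to 8 bytes and emitted directly into
// this function's constant pool, so reads of it become a single pc-relative
// ldr instead of an address computation plus a load from the data section.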
bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    if (!(GV = GA->getBaseObject()))
      return false;
  if (const auto *V = dyn_cast<GlobalVariable>(GV))
    return V->isConstant();
  return isa<Function>(GV);
}

SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  switch (Subtarget->getTargetTriple().getObjectFormat()) {
  default: llvm_unreachable("unknown object format");
  case Triple::COFF:
    return LowerGlobalAddressWindows(Op, DAG);
  case Triple::ELF:
    return LowerGlobalAddressELF(Op, DAG);
  case Triple::MachO:
    return LowerGlobalAddressDarwin(Op, DAG);
  }
}
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  const TargetMachine &TM = getTargetMachine();
  bool IsRO = isReadOnly(GV);

  // promoteToConstantPool only if not generating XO text section
  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
    if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
      return V;

  if (isPositionIndependent()) {
    bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                           UseGOT_PREL ? ARMII::MO_GOT : 0);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    if (UseGOT_PREL)
      Result =
          DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
    return Result;
  } else if (Subtarget->isROPI() && IsRO) {
    // PC-relative.
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    return Result;
  } else if (Subtarget->isRWPI() && !IsRO) {
    // SB-relative.
    SDValue RelAddr;
    if (Subtarget->useMovt()) {
      ++NumMovwMovt;
      SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
    } else { // use literal pool for address constant
      ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      RelAddr = DAG.getLoad(
          PtrVT, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }
    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via movt/movw
  // pair. This is always cheaper.
  if (Subtarget->useMovt()) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }
}
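// For example (illustrative, comment only): with movw/movt available a
// DSO-local global lowers to
//   movw r0, :lower16:sym
//   movt r0, :upper16:sym
// while the PIC path with UseGOT_PREL adds a GOT load on top of the
// pc-relative address, and the non-movt fallback loads the address from the
// constant pool.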
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Darwin");
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  if (Subtarget->useMovt())
    ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into multiple nodes
  unsigned Wrapper =
      isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;

  SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);

  if (Subtarget->isGVIndirectSymbol(GV))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}

SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
  assert(Subtarget->useMovt() &&
         "Windows on ARM expects to use movw/movt");
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Windows");

  const TargetMachine &TM = getTargetMachine();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
  if (GV->hasDLLImportStorageClass())
    TargetFlags = ARMII::MO_DLLIMPORT;
  else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    TargetFlags = ARMII::MO_COFFSTUB;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  SDLoc DL(Op);

  ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes.
  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
                                                  TargetFlags));
  if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}
SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = DAG.getConstant(0, dl, MVT::i32);
  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
                     Op.getOperand(1), Val);
}

SDValue
ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
}

SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc dl(Op);
  return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
                     Op.getOperand(0));
}
SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
    SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  unsigned IntNo =
      cast<ConstantSDNode>(
          Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
          ->getZExtValue();
  switch (IntNo) {
  default:
    return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::arm_gnu_eabi_mcount: {
    MachineFunction &MF = DAG.getMachineFunction();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDLoc dl(Op);
    SDValue Chain = Op.getOperand(0);
    // call "\01__gnu_mcount_nc"
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    const uint32_t *Mask =
        ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
    assert(Mask && "Missing call preserved mask for calling convention");
    // Mark LR an implicit live-in.
    unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
    SDValue ReturnAddress =
        DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
    constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
    SDValue Callee =
        DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
    SDValue RegisterMask = DAG.getRegisterMask(Mask);
    if (Subtarget->isThumb())
      return SDValue(
          DAG.getMachineNode(
              ARM::tBL_PUSHLR, dl, ResultTys,
              {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
               DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
          0);
    return SDValue(
        DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
                           {ReturnAddress, Callee, RegisterMask, Chain}),
        0);
  }
  }
}
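// Descriptive note: BL_PUSHLR / tBL_PUSHLR model the __gnu_mcount_nc contract
// in which the caller pushes lr before the call and the profiling routine
// pops it; that is why lr is made a live-in and passed as the ReturnAddress
// operand above.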
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *Subtarget) const {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::arm_cls: {
    const SDValue &Operand = Op.getOperand(1);
    const EVT VTy = Op.getValueType();
    SDValue SRA =
        DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
    SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
    SDValue SHL =
        DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
    SDValue OR =
        DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
    SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
    return Result;
  }
  case Intrinsic::arm_cls64: {
    // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
    //          else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
    const SDValue &Operand = Op.getOperand(1);
    const EVT VTy = Op.getValueType();

    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
                             DAG.getConstant(1, dl, VTy));
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
                             DAG.getConstant(0, dl, VTy));
    SDValue Constant0 = DAG.getConstant(0, dl, VTy);
    SDValue Constant1 = DAG.getConstant(1, dl, VTy);
    SDValue Constant31 = DAG.getConstant(31, dl, VTy);
    SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
    SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
    SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
    SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
    SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
    SDValue CheckLo =
        DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
    SDValue HiIsZero =
        DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
    SDValue AdjustedLo =
        DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
    SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
    SDValue Result =
        DAG.getSelect(dl, VTy, CheckLo,
                      DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31),
                      CLSHi);
    return Result;
  }
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue CPAddr;
    bool IsPositionIndependent = isPositionIndependent();
    unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
                                      ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

    if (IsPositionIndependent) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vabs:
    return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
      ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminnm:
  case Intrinsic::arm_neon_vmaxnm: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
      ? ISD::FMINNUM : ISD::FMAXNUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu: {
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
      ? ISD::UMIN : ISD::UMAX;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    if (!Op.getValueType().isFloatingPoint()) {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
        ? ISD::SMIN : ISD::SMAX;
      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
      ? ISD::FMINIMUM : ISD::FMAXIMUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vtbl1:
    return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_pred_i2v:
  case Intrinsic::arm_mve_pred_v2i:
    return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_vreinterpretq:
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_lsll:
    return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_asrl:
    return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
}
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
  auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
  if (SSID == SyncScope::SingleThread)
    // Just preserve the chain.
    return Op.getOperand(0);

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
    Domain = ARM_MB::SY;
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible with
    // Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
    Domain = ARM_MB::ISHST;
  }

  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}

static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 do not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW.
    return Op.getOperand(0);

  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}
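// Descriptive note: the isRead/isData operands of ARMISD::PRELOAD select
// between PLD, PLDW (write prefetch, v7 + MP extension only) and PLI; the
// Thumb instruction patterns expect the opposite polarity for both bits,
// which is what the bit flips above account for.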
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}

SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  if (!Subtarget->isLittle())
    std::swap (ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}

// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval). Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: The frame index registers were stored into.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Currently, two use-cases are possible:
  // Case #1. Non-var-args function, and we meet first byval parameter.
  //          Setup first unallocated register as first byval register;
  //          eat all remaining registers
  //          (these two actions are performed by HandleByVal method).
  //          Then, here, we initialize stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function, that doesn't contain byval parameters.
  //          The same: eat all remaining unallocated registers,
  //          initialize stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}
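
// Worked example for the offset computation above (illustrative): if a byval
// argument was split so that r2 and r3 carry its first 8 bytes, RBegin = r2
// and ArgOffset becomes -4 * (R4 - R2) = -8 (using the consecutive GPR
// numbering the code relies on), so the two registers are spilled immediately
// below the portion of the aggregate the caller already placed on the stack,
// giving one contiguous in-memory copy.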

// Set up the stack frame that the va_list pointer will start from.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there are no regs to be stored, just point the address after the last
  // argument passed via stack.
  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                  CCInfo.getInRegsParamsCount(),
                                  CCInfo.getNextStackOffset(),
                                  std::max(4U, TotalArgRegsSaveSize));
  AFI->setVarArgsFrameIndex(FrameIndex);
}

bool ARMTargetLowering::splitValueIntoRegisterParts(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
    unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
  bool IsABIRegCopy = CC.hasValue();
  EVT ValueVT = Val.getValueType();
  if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
      PartVT == MVT::f32) {
    unsigned ValueBits = ValueVT.getSizeInBits();
    unsigned PartBits = PartVT.getSizeInBits();
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
    Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
    Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
    Parts[0] = Val;
    return true;
  }
  return false;
}
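
// Illustrative example of the ABI packing above: an f16 value passed or
// returned in s0 under the hard-float ABI is bitcast to i16, any-extended to
// i32 and bitcast to f32, i.e. the half value lives in the low 16 bits of the
// single-precision register; joinRegisterPartsIntoValue below is the inverse.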

SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
    SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
    MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
  bool IsABIRegCopy = CC.hasValue();
  if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
      PartVT == MVT::f32) {
    unsigned ValueBits = ValueVT.getSizeInBits();
    unsigned PartBits = PartVT.getSizeInBits();
    SDValue Val = Parts[0];

    Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
    Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
    Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
    return Val;
  }
  return SDValue();
}

SDValue ARMTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));

  SmallVector<SDValue, 16> ArgValues;
  SDValue ArgValue;
  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;

  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet byval parameter.
  // We also increase this value in case of varargs function.
  AFI->setArgRegsSaveSize(0);

  // Calculate the amount of stack space that we need to allocate to store
  // byval and variadic arguments that are passed in registers.
  // We need to know this before we allocate the first byval or variadic
  // argument, as they will be allocated a stack slot below the CFA (Canonical
  // Frame Address, the stack pointer at entry to the function).
  unsigned ArgRegBegin = ARM::R4;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
      break;

    CCValAssign &VA = ArgLocs[i];
    unsigned Index = VA.getValNo();
    ISD::ArgFlagsTy Flags = Ins[Index].Flags;
    if (!Flags.isByVal())
      continue;

    assert(VA.isMemLoc() && "unexpected byval pointer in reg");
    unsigned RBegin, REnd;
    CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
    ArgRegBegin = std::min(ArgRegBegin, RBegin);

    CCInfo.nextInRegsParam();
  }
  CCInfo.rewindByValRegsInfo();

  int lastInsIndex = -1;
  if (isVarArg && MFI.hasVAStart()) {
    unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    if (RegIdx != array_lengthof(GPRArgRegs))
      ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
  }

  unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
  AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    if (Ins[VA.getValNo()].isOrigArg()) {
      std::advance(CurOrigArg,
                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
    }
    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();

      if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
        // f64 and vector types are split up into multiple registers or
        // combinations of registers and stack slots.
        SDValue ArgValue1 =
            GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
        VA = ArgLocs[++i]; // skip ahead to next loc
        SDValue ArgValue2;
        if (VA.isMemLoc()) {
          int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
          ArgValue2 = DAG.getLoad(
              MVT::f64, dl, Chain, FIN,
              MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
        } else {
          ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
        }
        ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
                               ArgValue1, DAG.getIntPtrConstant(0, dl));
        ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
                               ArgValue2, DAG.getIntPtrConstant(1, dl));
      } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
        ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
      } else {
        const TargetRegisterClass *RC;

        if (RegVT == MVT::f16 || RegVT == MVT::bf16)
          RC = &ARM::HPRRegClass;
        else if (RegVT == MVT::f32)
          RC = &ARM::SPRRegClass;
        else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
                 RegVT == MVT::v4bf16)
          RC = &ARM::DPRRegClass;
        else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
                 RegVT == MVT::v8bf16)
          RC = &ARM::QPRRegClass;
        else if (RegVT == MVT::i32)
          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;
        else
          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

        // Transform the arguments in physical registers into virtual ones.
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

        // If this value is passed in r0 and has the returned attribute (e.g.
        // C++ 'structors), record this fact for later use.
        if (VA.getLocReg() == ARM::R0 &&
            Ins[VA.getValNo()].Flags.isReturned()) {
          AFI->setPreservesR0();
        }
      }

      // If this is an 8 or 16-bit value, it is really passed promoted
      // to 32 bits. Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default: llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full: break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      }

      // f16 arguments have their size extended to 4 bytes and passed as if
      // they had been copied to the LSBs of a 32-bit register.
      // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
      if (VA.needsCustom() &&
          (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
        ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);

      InVals.push_back(ArgValue);
    } else { // VA.isRegLoc()

      assert(VA.isMemLoc());
      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

      int index = VA.getValNo();

      // Some Ins[] entries become multiple ArgLoc[] entries.
      // Process them only once.
      if (index != lastInsIndex) {
        ISD::ArgFlagsTy Flags = Ins[index].Flags;
        // FIXME: For now, all byval parameter objects are marked mutable.
        // This can be changed with more analysis.
        // In case of tail call optimization mark all arguments mutable.
        // Since they could be overwritten by lowering of arguments in case of
        // a tail call.
        if (Flags.isByVal()) {
          assert(Ins[index].isOrigArg() &&
                 "Byval arguments cannot be implicit");
          unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();

          int FrameIndex = StoreByValRegs(
              CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
              VA.getLocMemOffset(), Flags.getByValSize());
          InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
          CCInfo.nextInRegsParam();
        } else {
          unsigned FIOffset = VA.getLocMemOffset();
          int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                         FIOffset, true);

          // Create load nodes to retrieve arguments from the stack.
          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
          InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                       MachinePointerInfo::getFixedStack(
                                           DAG.getMachineFunction(), FI)));
        }
        lastInsIndex = index;
      }
    }
  }

  // varargs
  if (isVarArg && MFI.hasVAStart()) {
    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
                         TotalArgRegsSaveSize);
    if (AFI->isCmseNSEntryFunction()) {
      DiagnosticInfoUnsupported Diag(
          DAG.getMachineFunction().getFunction(),
          "secure entry function must not be variadic", dl.getDebugLoc());
      DAG.getContext()->diagnose(Diag);
    }
  }

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  if (canGuaranteeTCO(CallConv, TailCallOpt)) {
    // The only way to guarantee a tail call is if the callee restores its
    // argument area, but it must also keep the stack aligned when doing so.
    const DataLayout &DL = DAG.getDataLayout();
    StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());

    AFI->setArgumentStackToRestore(StackArgSize);
  }
  AFI->setArgumentStackSize(StackArgSize);

  if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
    DiagnosticInfoUnsupported Diag(
        DAG.getMachineFunction().getFunction(),
        "secure entry function requires arguments on stack", dl.getDebugLoc());
    DAG.getContext()->diagnose(Diag);
  }

  return Chain;
}

/// isFloatingPointZero - Return true if this is +0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->getValueAPF().isPosZero();
    }
  } else if (Op->getOpcode() == ISD::BITCAST &&
             Op->getValueType(0) == MVT::f64) {
    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
    // created by LowerConstantFP().
    SDValue BitcastOp = Op->getOperand(0);
    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
        isNullConstant(BitcastOp->getOperand(0)))
      return true;
  }
  return false;
}

/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                     SDValue &ARMcc, SelectionDAG &DAG,
                                     const SDLoc &dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate((int32_t)C)) {
      // Constant does not fit, try adjusting it by one.
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
             (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
    // In ARM and Thumb-2, the compare instructions can shift their second
    // operand.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }

  // Thumb1 has very limited immediate modes, so turning an "and" into a
  // shift can save multiple instructions.
  //
  // If we have (x & C1), and C1 is an appropriate mask, we can transform it
  // into "((x << n) >> n)". But that isn't necessarily profitable on its
  // own. If it's the operand to an unsigned comparison with an immediate,
  // we can eliminate one of the shifts: we transform
  // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
  //
  // We avoid transforming cases which aren't profitable due to encoding
  // constraints:
  //
  // 1. C2 fits into the immediate field of a cmp, and the transformed version
  //    would not; in that case, we're essentially trading one immediate load
  //    for another.
  // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
  // 3. C2 is zero; we have other code for this special case.
  //
  // FIXME: Figure out profitability for Thumb2; we usually can't save an
  // instruction, since the AND is always one instruction anyway, but we could
  // use narrow instructions in some cases.
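  //
  // Example (illustrative, not from the original source): for
  // "(x & 0x3ff) == 0x300" the mask has 22 leading zeros, so the code below
  // rewrites it as "(x << 22) == (0x300 << 22)", eliminating the separate AND
  // with a constant that does not fit a Thumb1 8-bit immediate.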
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
      LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
      !isSignedIntSetCC(CC)) {
    unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
    auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
    uint64_t RHSV = RHSC->getZExtValue();
    if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
      unsigned ShiftBits = countLeadingZeros(Mask);
      if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
        SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
        LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
        RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
      }
    }
  }

  // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
  // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
  // way a cmp would.
  // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
  // some tweaks to the heuristics for the previous and->shift transform.
  // FIXME: Optimize cases where the LHS isn't a shift.
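  //
  // Reasoning sketch for the transform below (added for clarity): after
  // "lsls r, x, #c+1" the carry flag holds bit 31 of (x << c) and the Z flag
  // is set iff the low 31 bits of (x << c) are zero, so the HI condition
  // (C set and Z clear) is exactly "(x << c) u> 0x80000000".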
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
      isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
      CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
    unsigned ShiftAmt =
        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
    SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                LHS.getOperand(0),
                                DAG.getConstant(ShiftAmt, dl, MVT::i32));
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                     Shift.getValue(1), SDValue());
    ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
    return Chain.getValue(1);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

  // If the RHS is a constant zero then the V (overflow) flag will never be
  // set. This can allow us to simplify GE to PL or LT to MI, which can be
  // simpler for other passes (like the peephole optimiser) to deal with.
  if (isNullConstant(RHS)) {
    switch (CondCode) {
    default: break;
    case ARMCC::GE:
      CondCode = ARMCC::PL;
      break;
    case ARMCC::LT:
      CondCode = ARMCC::MI;
      break;
    }
  }

  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool Signaling) const {
  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
                      dl, MVT::Glue, LHS, RHS);
  else
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
                      dl, MVT::Glue, LHS);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1));

  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
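//
// Rough intuition for the SADDO case below (added for clarity, not from the
// original source): the computation is Value = LHS + RHS and the check is
// CMP Value, LHS; the subtraction Value - LHS overflows (V set) exactly when
// the original addition did, so ARMCC::VC names the non-overflowing case.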
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend. This is not as good as the natural
  // CMP case because it causes a register dependency and cannot be folded
  // later.

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::UADDO:
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
    // We do not use it in the USUBO case as Value may not be used.
    Value = DAG.getNode(ARMISD::ADDC, dl,
                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
                .getValue(0);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::SSUBO:
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::USUBO:
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::UMULO:
    // We generate a UMUL_LOHI and then check if the high word is 0.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::UMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getConstant(0, dl, MVT::i32));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  case ISD::SMULO:
    // We generate a SMUL_LOHI and then check if all the bits of the high word
    // are the same as the sign bit of the low word.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::SMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getNode(ISD::SRA, dl, Op.getValueType(),
                                          Value.getValue(0),
                                          DAG.getConstant(31, dl, MVT::i32)));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  }

  return std::make_pair(Value, OverflowCmp);
}

SDValue
ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue Value, OverflowCmp;
  SDValue ARMcc;
  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDLoc dl(Op);
  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  EVT VT = Op.getValueType();

  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
                                 ARMcc, CCR, OverflowCmp);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
                                              SelectionDAG &DAG) {
  SDLoc DL(BoolCarry);
  EVT CarryVT = BoolCarry.getValueType();

  // This converts the boolean value carry into the carry flag by doing
  // ARMISD::SUBC Carry, 1
  SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
                              DAG.getVTList(CarryVT, MVT::i32),
                              BoolCarry, DAG.getConstant(1, DL, CarryVT));
  return Carry.getValue(1);
}
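
// Why SUBC works here (illustrative note): computing BoolCarry - 1 produces an
// ARM carry-out (i.e. no borrow) exactly when BoolCarry >= 1, so for a 0/1
// boolean value the resulting carry flag equals the boolean itself.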

static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
                                              SelectionDAG &DAG) {
  SDLoc DL(Flags);

  // Now convert the carry flag into a boolean carry. We do this
  // using ARMISD:ADDE 0, 0, Carry
  return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32), Flags);
}

SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  EVT VT = Op.getValueType();
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Value;
  SDValue Overflow;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::UADDO:
    Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    break;
  case ISD::USUBO: {
    Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
    // value. So compute 1 - C.
    Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
                           DAG.getConstant(1, dl, MVT::i32), Overflow);
    break;
  }
  }

  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
                              const ARMSubtarget *Subtarget) {
  EVT VT = Op.getValueType();
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();
  if (!VT.isSimple())
    return SDValue();

  unsigned NewOpcode;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::i8:
    switch (Op->getOpcode()) {
    case ISD::UADDSAT:
      NewOpcode = ARMISD::UQADD8b;
      break;
    case ISD::SADDSAT:
      NewOpcode = ARMISD::QADD8b;
      break;
    case ISD::USUBSAT:
      NewOpcode = ARMISD::UQSUB8b;
      break;
    case ISD::SSUBSAT:
      NewOpcode = ARMISD::QSUB8b;
      break;
    }
    break;
  case MVT::i16:
    switch (Op->getOpcode()) {
    case ISD::UADDSAT:
      NewOpcode = ARMISD::UQADD16b;
      break;
    case ISD::SADDSAT:
      NewOpcode = ARMISD::QADD16b;
      break;
    case ISD::USUBSAT:
      NewOpcode = ARMISD::UQSUB16b;
      break;
    case ISD::SSUBSAT:
      NewOpcode = ARMISD::QSUB16b;
      break;
    }
    break;
  }

  SDLoc dl(Op);
  SDValue Add =
      DAG.getNode(NewOpcode, dl, MVT::i32,
                  DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
                  DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
}

SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  SDLoc dl(Op);
  unsigned Opc = Cond.getOpcode();

  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO)) {
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    EVT VT = Op.getValueType();

    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
                   OverflowCmp, DAG);
  }

  // Convert:
  //
  // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
        dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
        dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode()) {
        EVT VT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CCR = Cond.getOperand(3);
        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
        assert(True.getValueType() == VT);
        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
      }
    }
  }

  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
                     DAG.getConstant(1, dl, Cond.getValueType()));

  return DAG.getSelectCC(dl, Cond,
                         DAG.getConstant(0, dl, Cond.getValueType()),
                         SelectTrue, SelectFalse, ISD::SETNE);
}

static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality'.
  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
      CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
    CondCode = ARMCC::GE;

  // and GT for opcodes that return false for 'equality'.
  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
           CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
    CondCode = ARMCC::GT;

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
      CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
    swpCmpOps = true;

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands. Also do this if we don't care about the
  // unordered case.
  if (CC == ISD::SETUNE || CC == ISD::SETNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}

SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
                                   SDValue Cmp, SelectionDAG &DAG) const {
  if (!Subtarget->hasFP64() && VT == MVT::f64) {
    FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                           DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
    TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                          DAG.getVTList(MVT::i32, MVT::i32), TrueVal);

    SDValue TrueLow = TrueVal.getValue(0);
    SDValue TrueHigh = TrueVal.getValue(1);
    SDValue FalseLow = FalseVal.getValue(0);
    SDValue FalseHigh = FalseVal.getValue(1);

    SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
                              ARMcc, CCR, Cmp);
    SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
                               ARMcc, CCR, duplicateCmp(Cmp, DAG));

    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
  }

  return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
                     Cmp);
}

static bool isGTorGE(ISD::CondCode CC) {
  return CC == ISD::SETGT || CC == ISD::SETGE;
}

static bool isLTorLE(ISD::CondCode CC) {
  return CC == ISD::SETLT || CC == ISD::SETLE;
}

// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
// All of these conditions (and their <= and >= counterparts) will do:
//          x < k ? k : x
//          x > k ? x : k
//          k < x ? x : k
//          k > x ? k : x
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
}

// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//
// LLVM canonicalizes these to either a min(max()) or a max(min())
// pattern. This function tries to match one of these and will return a SSAT
// node if successful.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
// is a power of 2.
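//
// Illustrative instance of the signed case (added example): clamping x to
// [-128, 127] has k = 127, k + 1 = 128 = 2^7 and -128 = ~127, so the pattern
// above matches and the code below emits an ARMISD::SSAT node whose second
// operand is countTrailingOnes(127) = 7.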
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDValue V1 = Op.getOperand(0);
  SDValue K1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return SDValue();

  SDValue V2 = Op2.getOperand(0);
  SDValue K2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  SDValue V1Tmp = V1;
  SDValue V2Tmp = V2;

  // Check that the registers and the constants match a max(min()) or
  // min(max()) pattern
  if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
      K2 != FalseVal2 ||
      !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
    return SDValue();

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
  if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
    return SDValue();

  int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
      !isPowerOf2_64(PosVal + 1))
    return SDValue();

  // Handle the difference between USAT (unsigned) and SSAT (signed)
  // saturation.
  // At this point, PosVal is guaranteed to be positive
  uint64_t K = PosVal;
  SDLoc dl(Op);
  if (Val1 == ~Val2)
    return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
                       DAG.getConstant(countTrailingOnes(K), dl, VT));
  if (NegVal == 0)
    return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
                       DAG.getConstant(countTrailingOnes(K), dl, VT));

  return SDValue();
}

// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller.
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS
               : isa<ConstantSDNode>(RHS) ? &RHS
                                          : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}

bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  if (VT == MVT::f32)
    return !Subtarget->hasVFP2Base();
  if (VT == MVT::f64)
    return !Subtarget->hasFP64();
  if (VT == MVT::f16)
    return !Subtarget->hasFullFP16();
  return false;
}

SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
    if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
      return SatValue;

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
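  //
  // Illustrative instances of the rewrite below: "x < 0 ? 0 : x" becomes
  // "x & ~(x >> 31)" (the arithmetic shift yields an all-ones mask only for
  // negative x), and "x < -1 ? -1 : x" becomes "x | (x >> 31)".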
  SDValue LowerSatConstant;
  SDValue SatValue;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);

  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;

    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    }

    if (Opcode) {
      // If one of the constants is cheaper than another, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Attempt to use ZR checking TVal is 0, possibly inverting the condition
      // to get there. CSINC is not invertible like the other two
      // (~(~a) == a, -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
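    //
    // For instance (illustrative): "select (setlt a, b), x, y" maps to
    // ARMCC::LT, which VSEL cannot encode, so the code below inverts the
    // condition to GE and swaps the select operands, producing the equivalent
    // "select (setge a, b), y, x".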
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}

/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *N = Op.getNode();
  if (!N->hasOneUse())
    // Otherwise it requires moving the value from fp to integer registers.
    return false;
  if (!N->getNumValues())
    return false;
  EVT VT = Op.getValueType();
  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
    // vmrs are very slow, e.g. cortex-a8.
    return false;

  if (isFloatingPointZero(Op)) {
    SeenZero = true;
    return true;
  }
  return ISD::isNormalLoad(N);
}

static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  if (isFloatingPointZero(Op))
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlignment(),
                       Ld->getMemOperand()->getFlags());

  llvm_unreachable("Unknown VFP cmp argument!");
}

static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
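///
/// For example (illustrative): under these flags an equality test where one
/// operand is known to be +/-0.0 can be done by masking off the sign bit of
/// the raw bits (the 0x7fffffff AND below) and comparing the integer patterns,
/// avoiding the VFP compare and the fmstat flag transfer entirely.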
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  }

  return SDValue();
}

SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = Cond.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

    // Reverse the condition code.
    ARMCC::CondCodes CondCode =
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
    CondCode = ARMCC::getOppositeCondition(CondCode);
    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  return SDValue();
}
ARMTargetLowering::LowerBR_CC(SDValue Op
, SelectionDAG
&DAG
) const {
5634 SDValue Chain
= Op
.getOperand(0);
5635 ISD::CondCode CC
= cast
<CondCodeSDNode
>(Op
.getOperand(1))->get();
5636 SDValue LHS
= Op
.getOperand(2);
5637 SDValue RHS
= Op
.getOperand(3);
5638 SDValue Dest
= Op
.getOperand(4);
5641 if (isUnsupportedFloatingType(LHS
.getValueType())) {
5642 DAG
.getTargetLoweringInfo().softenSetCCOperands(
5643 DAG
, LHS
.getValueType(), LHS
, RHS
, CC
, dl
, LHS
, RHS
);
5645 // If softenSetCCOperands only returned one value, we should compare it to
5647 if (!RHS
.getNode()) {
5648 RHS
= DAG
.getConstant(0, dl
, LHS
.getValueType());
5653 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5655 unsigned Opc
= LHS
.getOpcode();
5656 bool OptimizeMul
= (Opc
== ISD::SMULO
|| Opc
== ISD::UMULO
) &&
5657 !Subtarget
->isThumb1Only();
5658 if (LHS
.getResNo() == 1 && (isOneConstant(RHS
) || isNullConstant(RHS
)) &&
5659 (Opc
== ISD::SADDO
|| Opc
== ISD::UADDO
|| Opc
== ISD::SSUBO
||
5660 Opc
== ISD::USUBO
|| OptimizeMul
) &&
5661 (CC
== ISD::SETEQ
|| CC
== ISD::SETNE
)) {
5662 // Only lower legal XALUO ops.
5663 if (!DAG
.getTargetLoweringInfo().isTypeLegal(LHS
->getValueType(0)))
5666 // The actual operation with overflow check.
5667 SDValue Value
, OverflowCmp
;
5669 std::tie(Value
, OverflowCmp
) = getARMXALUOOp(LHS
.getValue(0), DAG
, ARMcc
);
5671 if ((CC
== ISD::SETNE
) != isOneConstant(RHS
)) {
5672 // Reverse the condition code.
5673 ARMCC::CondCodes CondCode
=
5674 (ARMCC::CondCodes
)cast
<const ConstantSDNode
>(ARMcc
)->getZExtValue();
5675 CondCode
= ARMCC::getOppositeCondition(CondCode
);
5676 ARMcc
= DAG
.getConstant(CondCode
, SDLoc(ARMcc
), MVT::i32
);
5678 SDValue CCR
= DAG
.getRegister(ARM::CPSR
, MVT::i32
);
5680 return DAG
.getNode(ARMISD::BRCOND
, dl
, MVT::Other
, Chain
, Dest
, ARMcc
, CCR
,
5684 if (LHS
.getValueType() == MVT::i32
) {
5686 SDValue Cmp
= getARMCmp(LHS
, RHS
, CC
, ARMcc
, DAG
, dl
);
5687 SDValue CCR
= DAG
.getRegister(ARM::CPSR
, MVT::i32
);
5688 return DAG
.getNode(ARMISD::BRCOND
, dl
, MVT::Other
,
5689 Chain
, Dest
, ARMcc
, CCR
, Cmp
);
5692 if (getTargetMachine().Options
.UnsafeFPMath
&&
5693 (CC
== ISD::SETEQ
|| CC
== ISD::SETOEQ
||
5694 CC
== ISD::SETNE
|| CC
== ISD::SETUNE
)) {
5695 if (SDValue Result
= OptimizeVFPBrcond(Op
, DAG
))
5699 ARMCC::CondCodes CondCode
, CondCode2
;
5700 FPCCToARMCC(CC
, CondCode
, CondCode2
);
5702 SDValue ARMcc
= DAG
.getConstant(CondCode
, dl
, MVT::i32
);
5703 SDValue Cmp
= getVFPCmp(LHS
, RHS
, DAG
, dl
);
5704 SDValue CCR
= DAG
.getRegister(ARM::CPSR
, MVT::i32
);
5705 SDVTList VTList
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
5706 SDValue Ops
[] = { Chain
, Dest
, ARMcc
, CCR
, Cmp
};
5707 SDValue Res
= DAG
.getNode(ARMISD::BRCOND
, dl
, VTList
, Ops
);
5708 if (CondCode2
!= ARMCC::AL
) {
5709 ARMcc
= DAG
.getConstant(CondCode2
, dl
, MVT::i32
);
5710 SDValue Ops
[] = { Res
, Dest
, ARMcc
, CCR
, Res
.getValue(1) };
5711 Res
= DAG
.getNode(ARMISD::BRCOND
, dl
, VTList
, Ops
);

SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() ||
      (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump
    // table, which does another jump to the destination. This also makes it
    // easier to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}

static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 =
      static_cast<const ARMSubtarget &>(DAG.getSubtarget()).hasFullFP16();

  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}

SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    SDValue Result;
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
                                                             : ISD::FP_TO_UINT,
                    Loc, Op.getValueType(), SrcVal);
    return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  }

  return Op;
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 =
      static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    Opc = ISD::SINT_TO_FP;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    Opc = ISD::UINT_TO_FP;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}
SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorINT_TO_FP(Op, DAG);
  if (isUnsupportedFloatingType(VT)) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::SINT_TO_FP)
      LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    MakeLibCallOptions CallOptions;
    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                       CallOptions, SDLoc(Op)).first;
  }

  return Op;
}
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
               Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }
    return Res;
  }

  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Return LR, which contains the return address. Mark it an implicit live-in.
  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  const ARMBaseRegisterInfo &ARI =
      *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  Register FrameReg = ARI.getFrameRegister(MF);
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
  Register Reg = StringSwitch<unsigned>(RegName)
                     .Case("sp", ARM::SP)
                     .Default(0);
  if (Reg)
    return Reg;
  report_fatal_error(Twine("Invalid register name \""
                           + StringRef(RegName) + "\"."));
}
// Result is 64 bit value so split into two 32 bit values and return as a
// pair of values.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
         && "ExpandREAD_REGISTER called for non-i64 type result.");

  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                             N->getOperand(0), N->getOperand(1));

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                                Read.getValue(1)));
  Results.push_back(Read.getOperand(0));
}
/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point on forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Index)
    return SDValue();
  unsigned DstNumElt = DstVT.getVectorNumElements();

  // Compute the new index.
  const APInt &APIntIndex = Index->getAPIntValue();
  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  NewIndex *= APIntIndex;
  // Check if the new constant index fits into i32.
  if (NewIndex.getBitWidth() > 32)
    return SDValue();

  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  SDLoc dl(Op);
  SDValue ExtractSrc = Op.getOperand(0);
  EVT VecVT = EVT::getVectorVT(
      *DAG.getContext(), DstVT.getScalarType(),
      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
}
/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i16 and i64 types, either
  // as the source or destination of the bit convert.
  EVT SrcVT = Op.getValueType();
  EVT DstVT = N->getValueType(0);

  if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
      (DstVT == MVT::f16 || DstVT == MVT::bf16))
    return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
                     DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));

  if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
      (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
    return DAG.getNode(
        ISD::TRUNCATE, SDLoc(N), DstVT,
        MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));

  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
    return SDValue();

  // Turn i64->f64 into VMOVDRR.
  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
    // if we can combine the bitcast with its source.
    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
      return Val;

    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(0, dl, MVT::i32));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  }

  // Turn f64->i64 into VMOVRRD.
  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
    SDValue Cvt;
    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
        SrcVT.getVectorNumElements() > 1)
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32),
                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
    else
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32), Op);
    // Merge the pieces into a single i64 value.
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  }

  return SDValue();
}
/// getZeroVector - Returns a vector of specified type with all zero elements.
/// Zero vectors are used to represent vector negation and in those cases
/// will be implemented with the NEON VNEG instruction.  However, VNEG does
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed.  Regardless, use a canonical VMOV to create the
/// zero vector.
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(VT.isVector() && "Expected a vector type");
  // The canonical modified immediate encoding of a zero vector is....0!
  SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}
/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
                           ARMcc, CCR, CmpLo);

  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue HiBigShift = Opc == ISD::SRA
                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                         DAG.getConstant(VTBits - 1, dl, VT))
                           : DAG.getConstant(0, dl, VT);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
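// For example, an SRA_PARTS of a 64-bit value by 40 (VTBits == 32) has
// ExtraShAmt == 8, so the GE compares succeed and the big-shift results are
// chosen: the low word becomes ShOpHi shifted right by 8 and the high word
// becomes the sign fill ShOpHi >> 31.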
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                           DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                            DAG.getConstant(3, dl, MVT::i32));
  return DAG.getMergeValues({And, Chain}, dl);
}
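// Worked example of the formula above: when FPSCR[23:22] == 3, adding 1 << 22
// carries into bit 24 and leaves bits 23:22 == 0, so 3 maps to 0; the values
// 0, 1 and 2 simply become 1, 2 and 3.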
SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue RMValue = Op->getOperand(1);

  // The rounding mode is in bits 23:22 of the FPSCR.
  // The llvm.set.rounding argument value to ARM rounding mode value mapping
  // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // (((arg - 1) & 3) << 22).
  //
  // It is expected that the argument of llvm.set.rounding is within the
  // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
  // responsibility of the code that generates llvm.set.rounding to ensure
  // this condition.

  // Calculate new value of FPSCR[23:22].
  RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
                        DAG.getConstant(1, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
                        DAG.getConstant(0x3, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
                        DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));

  // Get current value of FPSCR.
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  FPSCR = FPSCR.getValue(0);

  // Put new rounding mode into FPSCR[23:22].
  const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
  FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
                      DAG.getConstant(RMMask, DL, MVT::i32));
  FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
  SDValue Ops2[] = {
      Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}
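// Worked example of the mapping above: an llvm.set.rounding argument of 0
// gives ((0 - 1) & 3) == 3, while arguments 1, 2 and 3 give 0, 1 and 2, which
// is then shifted into FPSCR[23:22].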
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  if (VT.isVector() && ST->hasNEON()) {

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

    EVT ElemTy = VT.getVectorElementType();

    if (ElemTy == MVT::i8) {
      // Compute with: cttz(x) = ctpop(lsb - 1)
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
    }

    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
      unsigned NumBits = ElemTy.getSizeInBits();
      SDValue WidthMinus1 =
          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
    }

    // Compute with: cttz(x) = ctpop(lsb - 1)

    // Compute LSB - 1.
    SDValue Bits;
    if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register.
      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
    } else {
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    }
    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  }

  if (!ST->hasV6T2Ops())
    return SDValue();

  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
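// Worked example of the identities used above: for x == 0b01001000,
// LSB == x & -x == 0b00001000, so ctpop(LSB - 1) == ctpop(0b00000111) == 3,
// and for 32-bit elements (32 - 1) - ctlz(LSB) == 31 - 28 == 3 == cttz(x).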
static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);

  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  unsigned EltSize = 8;
  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);

    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }

  return Res;
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN ||
      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                            ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}
/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation.  That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}
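// For example, with v4i32 operands (ElementBits == 32) a splat of 31 is valid
// for any left shift, while a splat of 32 is only valid for the long form
// (isLong == true).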
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation.  For a shift opcode, the value
/// is positive, but for an intrinsic the value count must be negative. The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (!isIntrinsic)
    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
    Cnt = -Cnt;
    return true;
  }
  return false;
}
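// For example, with v4i32 operands a shift opcode may use a splat of 1..32
// (1..16 for the narrowing forms), while the intrinsic forms encode the same
// shifts as the negative splats -1..-32, which are negated into Cnt above.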
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  int64_t Cnt;

  if (!VT.isVector())
    return SDValue();

  // We essentially have two forms here. Shift by an immediate and shift by a
  // vector register (there are also shift by a gpr, but that is just handled
  // with a tablegen pattern). We cannot easily match shift by an immediate in
  // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  // signed or unsigned, and a negative shift indicates a shift right).
  if (N->getOpcode() == ISD::SHL) {
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  }

  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
         "unexpected vector shift opcode");

  if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
    unsigned VShiftOpc =
        (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
    return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  // Other right shifts we don't have operations for (we use a shift left by a
  // negative number).
  EVT ShiftVT = N->getOperand(1).getValueType();
  SDValue NegatedCount = DAG.getNode(
      ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
  unsigned VShiftOpc =
      (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
  return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
}
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
                                const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // We can get here for a node like i32 = ISD::SHL i32, i64
  if (VT != MVT::i64)
    return SDValue();

  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SHL) &&
         "Unknown shift to lower!");

  unsigned ShOpc = N->getOpcode();
  if (ST->hasMVEIntegerOps()) {
    SDValue ShAmt = N->getOperand(1);
    unsigned ShPartsOpc = ARMISD::LSLL;
    ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);

    // If the shift amount is greater than 32 or has a greater bitwidth than 64
    // then do the default optimisation
    if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
        (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
      return SDValue();

    // Extract the lower 32 bits of the shift amount if it's not an i32
    if (ShAmt->getValueType(0) != MVT::i32)
      ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);

    if (ShOpc == ISD::SRL) {
      if (!Con)
        // There is no t2LSRLr instruction so negate and perform an lsll if the
        // shift amount is in a register, emulating a right shift.
        ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                            DAG.getConstant(0, dl, MVT::i32), ShAmt);
      else
        // Else generate an lsrl on the immediate shift amount
        ShPartsOpc = ARMISD::LSRL;
    } else if (ShOpc == ISD::SRA)
      ShPartsOpc = ARMISD::ASRL;

    // Lower 32 bits of the destination/source
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             N->getOperand(0), DAG.getConstant(0, dl, MVT::i32));
    // Upper 32 bits of the destination/source
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             N->getOperand(0), DAG.getConstant(1, dl, MVT::i32));

    // Generate the shift operation as computed above
    Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
                     ShAmt);
    // The upper 32 bits come from the second return value of lsll
    Hi = SDValue(Lo.getNode(), 1);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  }

  // We only lower SRA, SRL of 1 here, all others use generic lowering.
  if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
    return SDValue();

  // If we are in thumb mode, we don't have RRX.
  if (ST->isThumb1Only())
    return SDValue();

  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                           DAG.getConstant(0, dl, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                           DAG.getConstant(1, dl, MVT::i32));

  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
  // captures the result into a carry flag.
  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG : ARMISD::SRA_FLAG;
  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);

  // The low part is an ARMISD::RRX operand, which shifts the carry in.
  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
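// For example, a 64-bit SRL by one becomes: SRL_FLAG shifts Hi right by one
// and leaves Hi's old bit 0 in the carry flag, and RRX shifts Lo right by one
// while inserting that carry as the new bit 31 of the low word.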
6587 static SDValue
LowerVSETCC(SDValue Op
, SelectionDAG
&DAG
,
6588 const ARMSubtarget
*ST
) {
6589 bool Invert
= false;
6591 unsigned Opc
= ARMCC::AL
;
6593 SDValue Op0
= Op
.getOperand(0);
6594 SDValue Op1
= Op
.getOperand(1);
6595 SDValue CC
= Op
.getOperand(2);
6596 EVT VT
= Op
.getValueType();
6597 ISD::CondCode SetCCOpcode
= cast
<CondCodeSDNode
>(CC
)->get();
6602 CmpVT
= Op0
.getValueType().changeVectorElementTypeToInteger();
6604 assert(ST
->hasMVEIntegerOps() &&
6605 "No hardware support for integer vector comparison!");
6607 if (Op
.getValueType().getVectorElementType() != MVT::i1
)
6610 // Make sure we expand floating point setcc to scalar if we do not have
6611 // mve.fp, so that we can handle them from there.
6612 if (Op0
.getValueType().isFloatingPoint() && !ST
->hasMVEFloatOps())
6618 if (Op0
.getValueType().getVectorElementType() == MVT::i64
&&
6619 (SetCCOpcode
== ISD::SETEQ
|| SetCCOpcode
== ISD::SETNE
)) {
6620 // Special-case integer 64-bit equality comparisons. They aren't legal,
6621 // but they can be lowered with a few vector instructions.
6622 unsigned CmpElements
= CmpVT
.getVectorNumElements() * 2;
6623 EVT SplitVT
= EVT::getVectorVT(*DAG
.getContext(), MVT::i32
, CmpElements
);
6624 SDValue CastOp0
= DAG
.getNode(ISD::BITCAST
, dl
, SplitVT
, Op0
);
6625 SDValue CastOp1
= DAG
.getNode(ISD::BITCAST
, dl
, SplitVT
, Op1
);
6626 SDValue Cmp
= DAG
.getNode(ISD::SETCC
, dl
, SplitVT
, CastOp0
, CastOp1
,
6627 DAG
.getCondCode(ISD::SETEQ
));
6628 SDValue Reversed
= DAG
.getNode(ARMISD::VREV64
, dl
, SplitVT
, Cmp
);
6629 SDValue Merged
= DAG
.getNode(ISD::AND
, dl
, SplitVT
, Cmp
, Reversed
);
6630 Merged
= DAG
.getNode(ISD::BITCAST
, dl
, CmpVT
, Merged
);
6631 if (SetCCOpcode
== ISD::SETNE
)
6632 Merged
= DAG
.getNOT(dl
, Merged
, CmpVT
);
6633 Merged
= DAG
.getSExtOrTrunc(Merged
, dl
, VT
);
6637 if (CmpVT
.getVectorElementType() == MVT::i64
)
6638 // 64-bit comparisons are not legal in general.
6641 if (Op1
.getValueType().isFloatingPoint()) {
6642 switch (SetCCOpcode
) {
6643 default: llvm_unreachable("Illegal FP comparison");
6646 if (ST
->hasMVEFloatOps()) {
6647 Opc
= ARMCC::NE
; break;
6649 Invert
= true; LLVM_FALLTHROUGH
;
6652 case ISD::SETEQ
: Opc
= ARMCC::EQ
; break;
6654 case ISD::SETLT
: Swap
= true; LLVM_FALLTHROUGH
;
6656 case ISD::SETGT
: Opc
= ARMCC::GT
; break;
6658 case ISD::SETLE
: Swap
= true; LLVM_FALLTHROUGH
;
6660 case ISD::SETGE
: Opc
= ARMCC::GE
; break;
6661 case ISD::SETUGE
: Swap
= true; LLVM_FALLTHROUGH
;
6662 case ISD::SETULE
: Invert
= true; Opc
= ARMCC::GT
; break;
6663 case ISD::SETUGT
: Swap
= true; LLVM_FALLTHROUGH
;
6664 case ISD::SETULT
: Invert
= true; Opc
= ARMCC::GE
; break;
6665 case ISD::SETUEQ
: Invert
= true; LLVM_FALLTHROUGH
;
6667 // Expand this to (OLT | OGT).
6668 SDValue TmpOp0
= DAG
.getNode(ARMISD::VCMP
, dl
, CmpVT
, Op1
, Op0
,
6669 DAG
.getConstant(ARMCC::GT
, dl
, MVT::i32
));
6670 SDValue TmpOp1
= DAG
.getNode(ARMISD::VCMP
, dl
, CmpVT
, Op0
, Op1
,
6671 DAG
.getConstant(ARMCC::GT
, dl
, MVT::i32
));
6672 SDValue Result
= DAG
.getNode(ISD::OR
, dl
, CmpVT
, TmpOp0
, TmpOp1
);
6674 Result
= DAG
.getNOT(dl
, Result
, VT
);
6677 case ISD::SETUO
: Invert
= true; LLVM_FALLTHROUGH
;
6679 // Expand this to (OLT | OGE).
6680 SDValue TmpOp0
= DAG
.getNode(ARMISD::VCMP
, dl
, CmpVT
, Op1
, Op0
,
6681 DAG
.getConstant(ARMCC::GT
, dl
, MVT::i32
));
6682 SDValue TmpOp1
= DAG
.getNode(ARMISD::VCMP
, dl
, CmpVT
, Op0
, Op1
,
6683 DAG
.getConstant(ARMCC::GE
, dl
, MVT::i32
));
6684 SDValue Result
= DAG
.getNode(ISD::OR
, dl
, CmpVT
, TmpOp0
, TmpOp1
);
6686 Result
= DAG
.getNOT(dl
, Result
, VT
);
6691 // Integer comparisons.
6692 switch (SetCCOpcode
) {
6693 default: llvm_unreachable("Illegal integer comparison");
6695 if (ST
->hasMVEIntegerOps()) {
6696 Opc
= ARMCC::NE
; break;
6698 Invert
= true; LLVM_FALLTHROUGH
;
6700 case ISD::SETEQ
: Opc
= ARMCC::EQ
; break;
6701 case ISD::SETLT
: Swap
= true; LLVM_FALLTHROUGH
;
6702 case ISD::SETGT
: Opc
= ARMCC::GT
; break;
6703 case ISD::SETLE
: Swap
= true; LLVM_FALLTHROUGH
;
6704 case ISD::SETGE
: Opc
= ARMCC::GE
; break;
6705 case ISD::SETULT
: Swap
= true; LLVM_FALLTHROUGH
;
6706 case ISD::SETUGT
: Opc
= ARMCC::HI
; break;
6707 case ISD::SETULE
: Swap
= true; LLVM_FALLTHROUGH
;
6708 case ISD::SETUGE
: Opc
= ARMCC::HS
; break;
6711 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6712 if (ST
->hasNEON() && Opc
== ARMCC::EQ
) {
6714 if (ISD::isBuildVectorAllZeros(Op1
.getNode()))
6716 else if (ISD::isBuildVectorAllZeros(Op0
.getNode()))
6719 // Ignore bitconvert.
6720 if (AndOp
.getNode() && AndOp
.getOpcode() == ISD::BITCAST
)
6721 AndOp
= AndOp
.getOperand(0);
6723 if (AndOp
.getNode() && AndOp
.getOpcode() == ISD::AND
) {
6724 Op0
= DAG
.getNode(ISD::BITCAST
, dl
, CmpVT
, AndOp
.getOperand(0));
6725 Op1
= DAG
.getNode(ISD::BITCAST
, dl
, CmpVT
, AndOp
.getOperand(1));
6726 SDValue Result
= DAG
.getNode(ARMISD::VTST
, dl
, CmpVT
, Op0
, Op1
);
6728 Result
= DAG
.getNOT(dl
, Result
, VT
);
6735 std::swap(Op0
, Op1
);
6737 // If one of the operands is a constant vector zero, attempt to fold the
6738 // comparison to a specialized compare-against-zero form.
6740 if (ISD::isBuildVectorAllZeros(Op1
.getNode()))
6742 else if (ISD::isBuildVectorAllZeros(Op0
.getNode())) {
6743 if (Opc
== ARMCC::GE
)
6745 else if (Opc
== ARMCC::GT
)
6751 if (SingleOp
.getNode()) {
6752 Result
= DAG
.getNode(ARMISD::VCMPZ
, dl
, CmpVT
, SingleOp
,
6753 DAG
.getConstant(Opc
, dl
, MVT::i32
));
6755 Result
= DAG
.getNode(ARMISD::VCMP
, dl
, CmpVT
, Op0
, Op1
,
6756 DAG
.getConstant(Opc
, dl
, MVT::i32
));
6759 Result
= DAG
.getSExtOrTrunc(Result
, dl
, VT
);
6762 Result
= DAG
.getNOT(dl
, Result
, VT
);
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");

  // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
  // have to invert the carry first.
  Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                      DAG.getConstant(1, DL, MVT::i32), Carry);
  // This converts the boolean value carry into the carry flag.
  Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);

  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  SDValue ARMcc = DAG.getConstant(
      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                   Cmp.getValue(1), SDValue());
  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
                     CCR, Chain.getValue(1));
}
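// For example, an incoming ISD::SUBCARRY borrow of 1 becomes the ARM carry
// 1 - 1 == 0, matching the AArch32 convention where a subtraction that
// borrows leaves the carry flag clear.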
6797 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6798 /// valid vector constant for a NEON or MVE instruction with a "modified
6799 /// immediate" operand (e.g., VMOV). If so, return the encoded value.
6800 static SDValue
isVMOVModifiedImm(uint64_t SplatBits
, uint64_t SplatUndef
,
6801 unsigned SplatBitSize
, SelectionDAG
&DAG
,
6802 const SDLoc
&dl
, EVT
&VT
, EVT VectorVT
,
6803 VMOVModImmType type
) {
6804 unsigned OpCmode
, Imm
;
6805 bool is128Bits
= VectorVT
.is128BitVector();
6807 // SplatBitSize is set to the smallest size that splats the vector, so a
6808 // zero vector will always have SplatBitSize == 8. However, NEON modified
6809 // immediate instructions others than VMOV do not support the 8-bit encoding
6810 // of a zero vector, and the default encoding of zero is supposed to be the
6815 switch (SplatBitSize
) {
6817 if (type
!= VMOVModImm
)
6819 // Any 1-byte value is OK. Op=0, Cmode=1110.
6820 assert((SplatBits
& ~0xff) == 0 && "one byte splat value is too big");
6823 VT
= is128Bits
? MVT::v16i8
: MVT::v8i8
;
6827 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6828 VT
= is128Bits
? MVT::v8i16
: MVT::v4i16
;
6829 if ((SplatBits
& ~0xff) == 0) {
6830 // Value = 0x00nn: Op=x, Cmode=100x.
6835 if ((SplatBits
& ~0xff00) == 0) {
6836 // Value = 0xnn00: Op=x, Cmode=101x.
6838 Imm
= SplatBits
>> 8;
6844 // NEON's 32-bit VMOV supports splat values where:
6845 // * only one byte is nonzero, or
6846 // * the least significant byte is 0xff and the second byte is nonzero, or
6847 // * the least significant 2 bytes are 0xff and the third is nonzero.
6848 VT
= is128Bits
? MVT::v4i32
: MVT::v2i32
;
6849 if ((SplatBits
& ~0xff) == 0) {
6850 // Value = 0x000000nn: Op=x, Cmode=000x.
6855 if ((SplatBits
& ~0xff00) == 0) {
6856 // Value = 0x0000nn00: Op=x, Cmode=001x.
6858 Imm
= SplatBits
>> 8;
6861 if ((SplatBits
& ~0xff0000) == 0) {
6862 // Value = 0x00nn0000: Op=x, Cmode=010x.
6864 Imm
= SplatBits
>> 16;
6867 if ((SplatBits
& ~0xff000000) == 0) {
6868 // Value = 0xnn000000: Op=x, Cmode=011x.
6870 Imm
= SplatBits
>> 24;
6874 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6875 if (type
== OtherModImm
) return SDValue();
6877 if ((SplatBits
& ~0xffff) == 0 &&
6878 ((SplatBits
| SplatUndef
) & 0xff) == 0xff) {
6879 // Value = 0x0000nnff: Op=x, Cmode=1100.
6881 Imm
= SplatBits
>> 8;
6885 // cmode == 0b1101 is not supported for MVE VMVN
6886 if (type
== MVEVMVNModImm
)
6889 if ((SplatBits
& ~0xffffff) == 0 &&
6890 ((SplatBits
| SplatUndef
) & 0xffff) == 0xffff) {
6891 // Value = 0x00nnffff: Op=x, Cmode=1101.
6893 Imm
= SplatBits
>> 16;
6897 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6898 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6899 // VMOV.I32. A (very) minor optimization would be to replicate the value
6900 // and fall through here to test for a valid 64-bit splat. But, then the
6901 // caller would also need to check and handle the change in size.
6905 if (type
!= VMOVModImm
)
6907 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6908 uint64_t BitMask
= 0xff;
6909 unsigned ImmMask
= 1;
6911 for (int ByteNum
= 0; ByteNum
< 8; ++ByteNum
) {
6912 if (((SplatBits
| SplatUndef
) & BitMask
) == BitMask
) {
6914 } else if ((SplatBits
& BitMask
) != 0) {
6921 if (DAG
.getDataLayout().isBigEndian()) {
6922 // Reverse the order of elements within the vector.
6923 unsigned BytesPerElem
= VectorVT
.getScalarSizeInBits() / 8;
6924 unsigned Mask
= (1 << BytesPerElem
) - 1;
6925 unsigned NumElems
= 8 / BytesPerElem
;
6926 unsigned NewImm
= 0;
6927 for (unsigned ElemNum
= 0; ElemNum
< NumElems
; ++ElemNum
) {
6928 unsigned Elem
= ((Imm
>> ElemNum
* BytesPerElem
) & Mask
);
6929 NewImm
|= Elem
<< (NumElems
- ElemNum
- 1) * BytesPerElem
;
6934 // Op=1, Cmode=1110.
6936 VT
= is128Bits
? MVT::v2i64
: MVT::v1i64
;
6941 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6944 unsigned EncodedVal
= ARM_AM::createVMOVModImm(OpCmode
, Imm
);
6945 return DAG
.getTargetConstant(EncodedVal
, dl
, MVT::i32
);
6948 SDValue
ARMTargetLowering::LowerConstantFP(SDValue Op
, SelectionDAG
&DAG
,
6949 const ARMSubtarget
*ST
) const {
6950 EVT VT
= Op
.getValueType();
6951 bool IsDouble
= (VT
== MVT::f64
);
6952 ConstantFPSDNode
*CFP
= cast
<ConstantFPSDNode
>(Op
);
6953 const APFloat
&FPVal
= CFP
->getValueAPF();
6955 // Prevent floating-point constants from using literal loads
6956 // when execute-only is enabled.
6957 if (ST
->genExecuteOnly()) {
6958 // If we can represent the constant as an immediate, don't lower it
6959 if (isFPImmLegal(FPVal
, VT
))
6961 // Otherwise, construct as integer, and move to float register
6962 APInt INTVal
= FPVal
.bitcastToAPInt();
6964 switch (VT
.getSimpleVT().SimpleTy
) {
6966 llvm_unreachable("Unknown floating point type!");
6969 SDValue Lo
= DAG
.getConstant(INTVal
.trunc(32), DL
, MVT::i32
);
6970 SDValue Hi
= DAG
.getConstant(INTVal
.lshr(32).trunc(32), DL
, MVT::i32
);
6971 return DAG
.getNode(ARMISD::VMOVDRR
, DL
, MVT::f64
, Lo
, Hi
);
6974 return DAG
.getNode(ARMISD::VMOVSR
, DL
, VT
,
6975 DAG
.getConstant(INTVal
, DL
, MVT::i32
));
6979 if (!ST
->hasVFP3Base())
6982 // Use the default (constant pool) lowering for double constants when we have
6984 if (IsDouble
&& !Subtarget
->hasFP64())
6987 // Try splatting with a VMOV.f32...
6988 int ImmVal
= IsDouble
? ARM_AM::getFP64Imm(FPVal
) : ARM_AM::getFP32Imm(FPVal
);
6991 if (IsDouble
|| !ST
->useNEONForSinglePrecisionFP()) {
6992 // We have code in place to select a valid ConstantFP already, no need to
6997 // It's a float and we are trying to use NEON operations where
6998 // possible. Lower it to a splat followed by an extract.
7000 SDValue NewVal
= DAG
.getTargetConstant(ImmVal
, DL
, MVT::i32
);
7001 SDValue VecConstant
= DAG
.getNode(ARMISD::VMOVFPIMM
, DL
, MVT::v2f32
,
7003 return DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::f32
, VecConstant
,
7004 DAG
.getConstant(0, DL
, MVT::i32
));
7007 // The rest of our options are NEON only, make sure that's allowed before
7009 if (!ST
->hasNEON() || (!IsDouble
&& !ST
->useNEONForSinglePrecisionFP()))
7013 uint64_t iVal
= FPVal
.bitcastToAPInt().getZExtValue();
7015 // It wouldn't really be worth bothering for doubles except for one very
7016 // important value, which does happen to match: 0.0. So make sure we don't do
7018 if (IsDouble
&& (iVal
& 0xffffffff) != (iVal
>> 32))
7021 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7022 SDValue NewVal
= isVMOVModifiedImm(iVal
& 0xffffffffU
, 0, 32, DAG
, SDLoc(Op
),
7023 VMovVT
, VT
, VMOVModImm
);
7024 if (NewVal
!= SDValue()) {
7026 SDValue VecConstant
= DAG
.getNode(ARMISD::VMOVIMM
, DL
, VMovVT
,
7029 return DAG
.getNode(ISD::BITCAST
, DL
, MVT::f64
, VecConstant
);
7031 // It's a float: cast and extract a vector element.
7032 SDValue VecFConstant
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2f32
,
7034 return DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::f32
, VecFConstant
,
7035 DAG
.getConstant(0, DL
, MVT::i32
));
7038 // Finally, try a VMVN.i32
7039 NewVal
= isVMOVModifiedImm(~iVal
& 0xffffffffU
, 0, 32, DAG
, SDLoc(Op
), VMovVT
,
7041 if (NewVal
!= SDValue()) {
7043 SDValue VecConstant
= DAG
.getNode(ARMISD::VMVNIMM
, DL
, VMovVT
, NewVal
);
7046 return DAG
.getNode(ISD::BITCAST
, DL
, MVT::f64
, VecConstant
);
7048 // It's a float: cast and extract a vector element.
7049 SDValue VecFConstant
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2f32
,
7051 return DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::f32
, VecFConstant
,
7052 DAG
.getConstant(0, DL
, MVT::i32
));
// check if a VEXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}
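// For example, for <8 x i8> the single-source mask <2, 3, 4, 5, 6, 7, 0, 1>
// is accepted by isSingletonVEXTMask with Imm == 2: the expected index simply
// wraps from 7 back to 0.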
7088 static bool isVEXTMask(ArrayRef
<int> M
, EVT VT
,
7089 bool &ReverseVEXT
, unsigned &Imm
) {
7090 unsigned NumElts
= VT
.getVectorNumElements();
7091 ReverseVEXT
= false;
7093 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7099 // If this is a VEXT shuffle, the immediate value is the index of the first
7100 // element. The other shuffle indices must be the successive elements after
7102 unsigned ExpectedElt
= Imm
;
7103 for (unsigned i
= 1; i
< NumElts
; ++i
) {
7104 // Increment the expected index. If it wraps around, it may still be
7105 // a VEXT but the source vectors must be swapped.
7107 if (ExpectedElt
== NumElts
* 2) {
7112 if (M
[i
] < 0) continue; // ignore UNDEF indices
7113 if (ExpectedElt
!= static_cast<unsigned>(M
[i
]))
7117 // Adjust the index value if the source operands will be swapped.
static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
  // range, then 0 is placed into the resulting vector. So pretty much any mask
  // of 8 elements can work here.
  return VT == MVT::v8i8 && M.size() == 8;
}
static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
                               unsigned Index) {
  if (Mask.size() == Elements * 2)
    return Index / Elements;
  return Mask[Index] == 0 ? 0 : 1;
}
7138 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
7139 // checking that pairs of elements in the shuffle mask represent the same index
7140 // in each vector, incrementing the expected index by 2 at each step.
7141 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7142 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7144 // WhichResult gives the offset for each element in the mask based on which
7145 // of the two results it belongs to.
7147 // The transpose can be represented either as:
7148 // result1 = shufflevector v1, v2, result1_shuffle_mask
7149 // result2 = shufflevector v1, v2, result2_shuffle_mask
7150 // where v1/v2 and the shuffle masks have the same number of elements
7151 // (here WhichResult (see below) indicates which result is being checked)
7154 // results = shufflevector v1, v2, shuffle_mask
7155 // where both results are returned in one vector and the shuffle mask has twice
7156 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7157 // want to check the low half and high half of the shuffle mask as if it were
7159 static bool isVTRNMask(ArrayRef
<int> M
, EVT VT
, unsigned &WhichResult
) {
7160 unsigned EltSz
= VT
.getScalarSizeInBits();
7164 unsigned NumElts
= VT
.getVectorNumElements();
7165 if (M
.size() != NumElts
&& M
.size() != NumElts
*2)
7168 // If the mask is twice as long as the input vector then we need to check the
7169 // upper and lower parts of the mask with a matching value for WhichResult
7170 // FIXME: A mask with only even values will be rejected in case the first
7171 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7172 // M[0] is used to determine WhichResult
7173 for (unsigned i
= 0; i
< M
.size(); i
+= NumElts
) {
7174 WhichResult
= SelectPairHalf(NumElts
, M
, i
);
7175 for (unsigned j
= 0; j
< NumElts
; j
+= 2) {
7176 if ((M
[i
+j
] >= 0 && (unsigned) M
[i
+j
] != j
+ WhichResult
) ||
7177 (M
[i
+j
+1] >= 0 && (unsigned) M
[i
+j
+1] != j
+ NumElts
+ WhichResult
))
7182 if (M
.size() == NumElts
*2)
7188 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7189 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7190 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7191 static bool isVTRN_v_undef_Mask(ArrayRef
<int> M
, EVT VT
, unsigned &WhichResult
){
7192 unsigned EltSz
= VT
.getScalarSizeInBits();
7196 unsigned NumElts
= VT
.getVectorNumElements();
7197 if (M
.size() != NumElts
&& M
.size() != NumElts
*2)
7200 for (unsigned i
= 0; i
< M
.size(); i
+= NumElts
) {
7201 WhichResult
= SelectPairHalf(NumElts
, M
, i
);
7202 for (unsigned j
= 0; j
< NumElts
; j
+= 2) {
7203 if ((M
[i
+j
] >= 0 && (unsigned) M
[i
+j
] != j
+ WhichResult
) ||
7204 (M
[i
+j
+1] >= 0 && (unsigned) M
[i
+j
+1] != j
+ WhichResult
))
7209 if (M
.size() == NumElts
*2)
7215 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7216 // that the mask elements are either all even and in steps of size 2 or all odd
7217 // and in steps of size 2.
7218 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7219 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7221 // Requires similar checks to that of isVTRNMask with
7222 // respect the how results are returned.
7223 static bool isVUZPMask(ArrayRef
<int> M
, EVT VT
, unsigned &WhichResult
) {
7224 unsigned EltSz
= VT
.getScalarSizeInBits();
7228 unsigned NumElts
= VT
.getVectorNumElements();
7229 if (M
.size() != NumElts
&& M
.size() != NumElts
*2)
7232 for (unsigned i
= 0; i
< M
.size(); i
+= NumElts
) {
7233 WhichResult
= SelectPairHalf(NumElts
, M
, i
);
7234 for (unsigned j
= 0; j
< NumElts
; ++j
) {
7235 if (M
[i
+j
] >= 0 && (unsigned) M
[i
+j
] != 2 * j
+ WhichResult
)
7240 if (M
.size() == NumElts
*2)
7243 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7244 if (VT
.is64BitVector() && EltSz
== 32)
7250 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7251 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7252 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7253 static bool isVUZP_v_undef_Mask(ArrayRef
<int> M
, EVT VT
, unsigned &WhichResult
){
7254 unsigned EltSz
= VT
.getScalarSizeInBits();
7258 unsigned NumElts
= VT
.getVectorNumElements();
7259 if (M
.size() != NumElts
&& M
.size() != NumElts
*2)
7262 unsigned Half
= NumElts
/ 2;
7263 for (unsigned i
= 0; i
< M
.size(); i
+= NumElts
) {
7264 WhichResult
= SelectPairHalf(NumElts
, M
, i
);
7265 for (unsigned j
= 0; j
< NumElts
; j
+= Half
) {
7266 unsigned Idx
= WhichResult
;
7267 for (unsigned k
= 0; k
< Half
; ++k
) {
7268 int MIdx
= M
[i
+ j
+ k
];
7269 if (MIdx
>= 0 && (unsigned) MIdx
!= Idx
)
7276 if (M
.size() == NumElts
*2)
7279 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7280 if (VT
.is64BitVector() && EltSz
== 32)
7286 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7287 // that pairs of elements of the shufflemask represent the same index in each
7288 // vector incrementing sequentially through the vectors.
7289 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7290 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7292 // Requires similar checks to that of isVTRNMask with respect the how results
7294 static bool isVZIPMask(ArrayRef
<int> M
, EVT VT
, unsigned &WhichResult
) {
7295 unsigned EltSz
= VT
.getScalarSizeInBits();
7299 unsigned NumElts
= VT
.getVectorNumElements();
7300 if (M
.size() != NumElts
&& M
.size() != NumElts
*2)
7303 for (unsigned i
= 0; i
< M
.size(); i
+= NumElts
) {
7304 WhichResult
= SelectPairHalf(NumElts
, M
, i
);
7305 unsigned Idx
= WhichResult
* NumElts
/ 2;
7306 for (unsigned j
= 0; j
< NumElts
; j
+= 2) {
7307 if ((M
[i
+j
] >= 0 && (unsigned) M
[i
+j
] != Idx
) ||
7308 (M
[i
+j
+1] >= 0 && (unsigned) M
[i
+j
+1] != Idx
+ NumElts
))
7314 if (M
.size() == NumElts
*2)
7317 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7318 if (VT
.is64BitVector() && EltSz
== 32)
7324 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7325 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7326 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7327 static bool isVZIP_v_undef_Mask(ArrayRef
<int> M
, EVT VT
, unsigned &WhichResult
){
7328 unsigned EltSz
= VT
.getScalarSizeInBits();
7332 unsigned NumElts
= VT
.getVectorNumElements();
7333 if (M
.size() != NumElts
&& M
.size() != NumElts
*2)
7336 for (unsigned i
= 0; i
< M
.size(); i
+= NumElts
) {
7337 WhichResult
= SelectPairHalf(NumElts
, M
, i
);
7338 unsigned Idx
= WhichResult
* NumElts
/ 2;
7339 for (unsigned j
= 0; j
< NumElts
; j
+= 2) {
7340 if ((M
[i
+j
] >= 0 && (unsigned) M
[i
+j
] != Idx
) ||
7341 (M
[i
+j
+1] >= 0 && (unsigned) M
[i
+j
+1] != Idx
))
7347 if (M
.size() == NumElts
*2)
7350 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7351 if (VT
.is64BitVector() && EltSz
== 32)
7357 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7358 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7359 static unsigned isNEONTwoResultShuffleMask(ArrayRef
<int> ShuffleMask
, EVT VT
,
7360 unsigned &WhichResult
,
7363 if (isVTRNMask(ShuffleMask
, VT
, WhichResult
))
7364 return ARMISD::VTRN
;
7365 if (isVUZPMask(ShuffleMask
, VT
, WhichResult
))
7366 return ARMISD::VUZP
;
7367 if (isVZIPMask(ShuffleMask
, VT
, WhichResult
))
7368 return ARMISD::VZIP
;
7371 if (isVTRN_v_undef_Mask(ShuffleMask
, VT
, WhichResult
))
7372 return ARMISD::VTRN
;
7373 if (isVUZP_v_undef_Mask(ShuffleMask
, VT
, WhichResult
))
7374 return ARMISD::VUZP
;
7375 if (isVZIP_v_undef_Mask(ShuffleMask
, VT
, WhichResult
))
7376 return ARMISD::VZIP
;
/// \return true if this is a reverse operation on a vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
      return false;

  return true;
}
static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
    return false;

  // If Top
  //   Look for <0, N, 2, N+2, 4, N+4, ..>.
  //   This inserts Input2 into Input1
  // else if not Top
  //   Look for <0, N+1, 2, N+3, 4, N+5, ..>
  //   This inserts Input1 into Input2
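  // For example, for a v8i16 shuffle with Top set and two sources (N = 8) the
  // accepted mask is <0, 8, 2, 10, 4, 12, 6, 14>, with undef lanes allowed.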
  unsigned Offset = Top ? 0 : 1;
  unsigned N = SingleSource ? 0 : NumElts;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)i)
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
      return false;
  }

  return true;
}
static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
  unsigned NumElts = ToVT.getVectorNumElements();
  if (NumElts != M.size())
    return false;

  // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
  // looking for patterns of:
  // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
  //  rev: N/2 0 N/2+1 1 N/2+2 2 ...
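  // For example, for ToVT = v8i16 (N = 8) the !rev pattern is
  // <0, 4, 1, 5, 2, 6, 3, 7> and the rev pattern is <4, 0, 5, 1, 6, 2, 7, 3>.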
  unsigned Off0 = rev ? NumElts / 2 : 0;
  unsigned Off1 = rev ? 0 : NumElts / 2;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
      return false;
  }

  return true;
}
// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
// from a pair of inputs. For example:
// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
//             FP_ROUND(EXTRACT_ELT(Y, 0),
//             FP_ROUND(EXTRACT_ELT(X, 1),
//             FP_ROUND(EXTRACT_ELT(Y, 1), ...)
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
                                         const ARMSubtarget *ST) {
  assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  if (!ST->hasMVEFloatOps())
    return SDValue();

  SDLoc dl(BV);
  EVT VT = BV.getValueType();
  if (VT != MVT::v8f16)
    return SDValue();

  // We are looking for a buildvector of fptrunc elements, where all the
  // elements are interleavingly extracted from two sources. Check the first
  // two items are valid enough and extract some info from them (they are
  // checked properly in the loop below).
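  // In other words the result is <X[0], Y[0], X[1], Y[1], X[2], Y[2], X[3],
  // Y[3]>, which is rebuilt below as two ARMISD::VCVTN nodes, the trailing
  // 0/1 operand selecting which set of lanes each source fills.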
  if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
      BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
    return SDValue();
  if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
      BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
    return SDValue();

  SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
  if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
    return SDValue();

  // Check all the values in the BuildVector line up with our expectations.
  for (unsigned i = 1; i < 4; i++) {
    auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
      return Trunc.getOpcode() == ISD::FP_ROUND &&
             Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
             Trunc.getOperand(0).getOperand(0) == Op &&
             Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
    };
    if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
      return SDValue();
    if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
      return SDValue();
  }

  SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
                           DAG.getConstant(0, dl, MVT::i32));
  return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
                     DAG.getConstant(1, dl, MVT::i32));
}
// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
// from a single input on alternating lanes. For example:
// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
//             FP_EXTEND(EXTRACT_ELT(X, 2),
//             FP_EXTEND(EXTRACT_ELT(X, 4), ...)
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
                                       const ARMSubtarget *ST) {
  assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  if (!ST->hasMVEFloatOps())
    return SDValue();

  SDLoc dl(BV);
  EVT VT = BV.getValueType();
  if (VT != MVT::v4f32)
    return SDValue();

  // We are looking for a buildvector of fpext elements, where all the
  // elements are alternating lanes from a single source. For example <0,2,4,6>
  // or <1,3,5,7>. Check the first two items are valid enough and extract some
  // info from them (they are checked properly in the loop below).
  if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
      BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
  if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
    return SDValue();

  // Check all the values in the BuildVector line up with our expectations.
  for (unsigned i = 1; i < 4; i++) {
    auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
      return Trunc.getOpcode() == ISD::FP_EXTEND &&
             Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
             Trunc.getOperand(0).getOperand(0) == Op &&
             Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
    };
    if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
      return SDValue();
  }

  return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
                     DAG.getConstant(Offset, dl, MVT::i32));
}
// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction).  Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  uint64_t Val;
  if (!isa<ConstantSDNode>(N))
    return SDValue();
  Val = cast<ConstantSDNode>(N)->getZExtValue();

  if (ST->isThumb1Only()) {
    if (Val <= 255 || ~Val <= 255)
      return DAG.getConstant(Val, dl, MVT::i32);
  } else {
    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
      return DAG.getConstant(Val, dl, MVT::i32);
  }
  return SDValue();
}
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BoolMask;
  unsigned BitsPerBool;
  if (NumElts == 4) {
    BitsPerBool = 4;
    BoolMask = 0xf;
  } else if (NumElts == 8) {
    BitsPerBool = 2;
    BoolMask = 0x3;
  } else if (NumElts == 16) {
    BitsPerBool = 1;
    BoolMask = 0x1;
  } else
    return SDValue();
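  // Each i1 lane occupies BitsPerBool bits of the predicate constant built
  // below: 4 bits per lane for v4i1, 2 for v8i1 and 1 for v16i1.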
  // If this is a single value copied into all lanes (a splat), we can just sign
  // extend that single value
  SDValue FirstOp = Op.getOperand(0);
  if (!isa<ConstantSDNode>(FirstOp) &&
      std::all_of(std::next(Op->op_begin()), Op->op_end(),
                  [&FirstOp](SDUse &U) {
                    return U.get().isUndef() || U.get() == FirstOp;
                  })) {
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
                              DAG.getValueType(MVT::i1));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
  }

  // First create base with bits set where known
  unsigned Bits32 = 0;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (!isa<ConstantSDNode>(V) && !V.isUndef())
      continue;
    bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
    if (BitSet)
      Bits32 |= BoolMask << (i * BitsPerBool);
  }

  // Add in unknown nodes
  SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
                             DAG.getConstant(Bits32, dl, MVT::i32));
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (isa<ConstantSDNode>(V) || V.isUndef())
      continue;
    Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
                       DAG.getConstant(i, dl, MVT::i32));
  }

  return Base;
}
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
                                        const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();

  // We are looking for a buildvector where each element is Op[0] + i*N
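  // For example BUILD_VECTOR(X, add(X, 4), add(X, 8), add(X, 12)) becomes
  // VIDUP(X, 4), where the increment N must be 1, 2, 4 or 8.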
  EVT VT = Op.getValueType();
  SDValue Op0 = Op.getOperand(0);
  unsigned NumElts = VT.getVectorNumElements();

  // Get the increment value from operand 1
  SDValue Op1 = Op.getOperand(1);
  if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
      !isa<ConstantSDNode>(Op1.getOperand(1)))
    return SDValue();
  unsigned N = Op1.getConstantOperandVal(1);
  if (N != 1 && N != 2 && N != 4 && N != 8)
    return SDValue();

  // Check that each other operand matches
  for (unsigned I = 2; I < NumElts; I++) {
    SDValue OpI = Op.getOperand(I);
    if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
        !isa<ConstantSDNode>(OpI.getOperand(1)) ||
        OpI.getConstantOperandVal(1) != I * N)
      return SDValue();
  }

  SDLoc DL(Op);
  return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
                     DAG.getConstant(N, DL, MVT::i32));
}
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerBUILD_VECTOR_i1(Op, DAG, ST);

  if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
    return R;

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatUndef.isAllOnesValue())
      return DAG.getUNDEF(VT);

    if ((ST->hasNEON() && SplatBitSize <= 64) ||
        (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
      // Check if an immediate VMOV works.
      EVT VmovVT;
      SDValue Val =
          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
                            SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);

      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Try an immediate VMVN.
      uint64_t NegatedImm = (~SplatBits).getZExtValue();
      Val = isVMOVModifiedImm(
          NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
          VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
        if (ImmVal != -1) {
          SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
        }
      }

      // If we are under MVE, generate a VDUP(constant), bitcast to the original
      // type.
      if (ST->hasMVEIntegerOps() &&
          (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
        EVT DupVT = SplatBitSize == 32   ? MVT::v4i32
                    : SplatBitSize == 16 ? MVT::v8i16
                                         : MVT::v16i8;
        SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
        SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
        return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
      }
    }
  }

  // Scan through the operands to see if only one value is used.
  //
  // As an optimisation, even if more than one value is used it may be more
  // profitable to splat with one value then change some lanes.
  //
  // Heuristically we decide to do this if the vector has a "dominant" value,
  // defined as splatted to more than half of the lanes.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool hasDominantValue = false;
  bool isConstant = true;

  // Map of the number of times a particular SDValue appears in the
  // element list.
  DenseMap<SDValue, unsigned> ValueCounts;
  SDValue Value;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    ValueCounts.insert(std::make_pair(V, 0));
    unsigned &Count = ValueCounts[V];

    // Is this value dominant? (takes up more than half of the lanes)
    if (++Count > (NumElts / 2)) {
      hasDominantValue = true;
      Value = V;
    }
  }
  if (ValueCounts.size() != 1)
    usesOnlyOneValue = false;
  if (!Value.getNode() && !ValueCounts.empty())
    Value = ValueCounts.begin()->first;

  if (ValueCounts.empty())
    return DAG.getUNDEF(VT);

  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  // Keep going if we are hitting this case.
  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  unsigned EltSize = VT.getScalarSizeInBits();

  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
  // i32 and try again.
  if (hasDominantValue && EltSize <= 32) {
    if (!isConstant) {
      SDValue N;

      // If we are VDUPing a value that comes directly from a vector, that will
      // cause an unnecessary move to and from a GPR, where instead we could
      // just use VDUPLANE. We can only do this if the lane being extracted
      // is at a constant index, as the VDUP from lane instructions only have
      // constant-index forms.
      ConstantSDNode *constIndex;
      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
        // We need to create a new undef vector to use for the VDUPLANE if the
        // size of the vector from which we get the value is different than the
        // size of the vector that we need to create. We will insert the element
        // such that the register coalescer will remove unnecessary copies.
        if (VT != Value->getOperand(0).getValueType()) {
          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                           VT.getVectorNumElements();
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT,
                                      DAG.getUNDEF(VT), Value,
                                      DAG.getConstant(index, dl, MVT::i32)),
                          DAG.getConstant(index, dl, MVT::i32));
        } else
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          Value->getOperand(0), Value->getOperand(1));
      } else
        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

      if (!usesOnlyOneValue) {
        // The dominant value was splatted as 'N', but we now have to insert
        // all differing elements.
        for (unsigned I = 0; I < NumElts; ++I) {
          if (Op.getOperand(I) == Value)
            continue;
          SmallVector<SDValue, 3> Ops;
          Ops.push_back(N);
          Ops.push_back(Op.getOperand(I));
          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
        }
      }
      return N;
    }
    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      MVT FVT = VT.getVectorElementType().getSimpleVT();
      assert(FVT == MVT::f32 || FVT == MVT::f16);
      MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
                                  Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
      Val = LowerBUILD_VECTOR(Val, DAG, ST);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
    if (usesOnlyOneValue) {
      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
      if (isConstant && Val.getNode())
        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
    }
  }

  // If all elements are constants and the case above didn't get hit, fall back
  // to the default expansion, which will generate a load from the constant
  // pool.
  if (isConstant)
    return SDValue();

  // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
  // vmovn). Empirical tests suggest this is rarely worth it for vectors of
  // length <= 2.
  if (NumElts >= 4)
    if (SDValue shuffle = ReconstructShuffle(Op, DAG))
      return shuffle;

  // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
  // VCVT's
  if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
    return VCVT;
  if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
    return VCVT;

  if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 &&
      VT != MVT::v4f32) {
    // If we haven't found an efficient lowering, try splitting a 128-bit vector
    // into two 64-bit vectors; we might discover a better way to lower it.
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
    EVT ExtVT = VT.getVectorElementType();
    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
    if (Lower.getOpcode() == ISD::BUILD_VECTOR)
      Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
    if (Upper.getOpcode() == ISD::BUILD_VECTOR)
      Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
    if (Lower && Upper)
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
  }

  // Vectors with 32- or 64-bit elements can be built by directly assigning
  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
  // will be legalized.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i)
      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  if (!isConstant && !usesOnlyOneValue) {
    SDValue Vec = DAG.getUNDEF(VT);
    for (unsigned i = 0 ; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  return SDValue();
}
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt = std::numeric_limits<unsigned>::max();
    unsigned MaxElt = 0;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase = 0;
    int WindowScale = 1;

    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = llvm::find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
    uint64_t VTSize = VT.getFixedSizeInBits();
    if (SrcVTSize == VTSize)
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVTSize < VTSize) {
      if (2 * SrcVTSize != VTSize)
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    if (SrcVTSize != 2 * VTSize)
      return SDValue();

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec =
        DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = llvm::find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // We can't handle more than two sources. This should have already
  // been checked before this point.
  assert(Sources.size() <= 2 && "Too many sources!");

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                            ShuffleOps[1], Mask, DAG);
  if (!Shuffle)
    return SDValue();
  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
}
enum ShuffleOpCodes {
  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,
  OP_VDUP0,
  OP_VDUP1,
  OP_VDUP2,
  OP_VDUP3,
  OP_VEXT1,
  OP_VEXT2,
  OP_VEXT3,
  OP_VUZPL, // VUZP, left result
  OP_VUZPR, // VUZP, right result
  OP_VZIPL, // VZIP, left result
  OP_VZIPR, // VZIP, right result
  OP_VTRNL, // VTRN, left result
  OP_VTRNR  // VTRN, right result
};
static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  return OpNum == OP_COPY || OpNum == OP_VREV || OpNum == OP_VDUP0 ||
         OpNum == OP_VDUP1 || OpNum == OP_VDUP2 || OpNum == OP_VDUP3;
}
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (M[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = M[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
      return true;
  }

  bool ReverseVEXT, isV_UNDEF;
  unsigned Imm, WhichResult;

  unsigned EltSize = VT.getScalarSizeInBits();
  if (EltSize >= 32 ||
      ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
      ShuffleVectorInst::isIdentityMask(M) ||
      isVREVMask(M, VT, 64) ||
      isVREVMask(M, VT, 32) ||
      isVREVMask(M, VT, 16))
    return true;
  else if (Subtarget->hasNEON() &&
           (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
            isVTBLMask(M, VT) ||
            isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
    return true;
  else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
           isReverseMask(M, VT))
    return true;
  else if (Subtarget->hasMVEIntegerOps() &&
           (isVMOVNMask(M, VT, true, false) ||
            isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
    return true;
  else
    return false;
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
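  // In the base-9 mask encoding used by the table, LHSID (1*9+2)*9+3 encodes
  // the identity mask <0,1,2,3> (a plain copy of LHS) and ((4*9+5)*9+6)*9+7
  // encodes <4,5,6,7> (a copy of RHS).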
  if (OpNum == OP_COPY) {
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (VT.getVectorElementType() == MVT::i16 ||
        VT.getVectorElementType() == MVT::f16)
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3:
    return DAG.getNode(ARMISD::VEXT, dl, VT,
                       OpLHS, OpRHS,
                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  case OP_VUZPL:
  case OP_VUZPR:
    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  }
}
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                       ArrayRef<int> ShuffleMask,
                                       SelectionDAG &DAG) {
  // Check to see if we can use the VTBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  SmallVector<SDValue, 8> VTBLMask;
  for (ArrayRef<int>::iterator
         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
    VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));

  if (V2.getNode()->isUndef())
    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
                       DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));

  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
                     DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
                                                      SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue OpLHS = Op.getOperand(0);
  EVT VT = OpLHS.getValueType();

  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
         "Expect an v8i16/v16i8 type");
  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
  // For a v16i8 type: after the VREV64 we have <7, ..., 0, 15, ..., 8>. Now,
  // extract the first 8 bytes into the top double word and the last 8 bytes
  // into the bottom double word. The v8i16 case is similar.
  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
                     DAG.getConstant(ExtractNum, DL, MVT::i32));
}
static EVT getVectorTyFromPredicateVector(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4i1:
    return MVT::v4i32;
  case MVT::v8i1:
    return MVT::v8i16;
  case MVT::v16i1:
    return MVT::v16i8;
  default:
    llvm_unreachable("Unexpected vector predicate type");
  }
}
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
                                    SelectionDAG &DAG) {
  // Converting from boolean predicates to integers involves creating a vector
  // of all ones or all zeroes and selecting the lanes based upon the real
  // predicate.
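  // For example a v4i1 predicate becomes a v4i32 vector whose lanes are
  // either all-ones or all-zeros.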
  SDValue AllOnes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
  AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);

  SDValue AllZeroes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
  AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);

  // Get full vector type from predicate type
  EVT NewVT = getVectorTyFromPredicateVector(VT);

  SDValue RecastV1;
  // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
  // this to a v16i1. This cannot be done with an ordinary bitcast because the
  // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
  // since we know in hardware the sizes are really the same.
  if (VT != MVT::v16i1)
    RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
  else
    RecastV1 = Pred;

  // Select either all ones or zeroes depending upon the real predicate bits.
  SDValue PredAsVector =
      DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);

  // Recast our new predicate-as-integer v16i8 vector into something
  // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
  return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
}
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  ArrayRef<int> ShuffleMask = SVN->getMask();

  assert(ST->hasMVEIntegerOps() &&
         "No support for vector shuffle of boolean predicates");

  SDValue V1 = Op.getOperand(0);
  SDLoc dl(Op);
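  // A reverse shuffle of a predicate can be done directly on the predicate
  // bits: move them into a GPR, bit-reverse the 32-bit value and shift the
  // reversed 16 predicate bits back down from the top half.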
  if (isReverseMask(ShuffleMask, VT)) {
    SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
    SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
    SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
                              DAG.getConstant(16, dl, MVT::i32));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
  }

  // Until we can come up with optimised cases for every single vector
  // shuffle in existence we have chosen the least painful strategy. This is
  // to essentially promote the boolean predicate to an 8-bit integer, where
  // each predicate represents a byte. Then we fall back on a normal integer
  // vector shuffle and convert the result back into a predicate vector. In
  // many cases the generated code might be even better than scalar code
  // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
  // fields in a register into 8 other arbitrary 2-bit fields!
  SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
  EVT NewVT = PredAsVector.getValueType();

  // Do the shuffle!
  SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
                                          DAG.getUNDEF(NewVT), ShuffleMask);

  // Now return the result of comparing the shuffled vector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
                                            ArrayRef<int> ShuffleMask,
                                            SelectionDAG &DAG) {
  // Attempt to lower the vector shuffle using as many whole register movs as
  // possible. This is useful for types smaller than 32bits, which would
  // often otherwise become a series of GPR movs.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  if (VT.getScalarSizeInBits() >= 32)
    return SDValue();

  assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
         "Unexpected vector type");
  int NumElts = VT.getVectorNumElements();
  int QuarterSize = NumElts / 4;
  // The four final parts of the vector, as i32's
  SDValue Parts[4];

  // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
  // <u,u,u,u>), returning the vmov lane index
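  // For a v16i8 shuffle each part covers 4 bytes; e.g. the part <8,9,10,11>
  // corresponds to 32-bit lane 2 of the first input and <20,21,22,23> to
  // 32-bit lane 1 of the second input.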
  auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
    // Detect which mov lane this would be from the first non-undef element.
    int MovIdx = -1;
    for (int i = 0; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0) {
        if (ShuffleMask[Start + i] % Length != i)
          return -1;
        MovIdx = ShuffleMask[Start + i] / Length;
        break;
      }
    }
    // If all items are undef, leave this for other combines
    if (MovIdx == -1)
      return -1;
    // Check the remaining values are the correct part of the same mov
    for (int i = 1; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0 &&
          (ShuffleMask[Start + i] / Length != MovIdx ||
           ShuffleMask[Start + i] % Length != i))
        return -1;
    }
    return MovIdx;
  };

  for (int Part = 0; Part < 4; ++Part) {
    // Does this part look like a mov
    int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
    if (Elt != -1) {
      SDValue Input = Op->getOperand(0);
      if (Elt >= 4) {
        Input = Op->getOperand(1);
        Elt -= 4;
      }
      SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
      Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
                                DAG.getConstant(Elt, dl, MVT::i32));
    }
  }

  // Nothing interesting found, just return
  if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
    return SDValue();

  // The other parts need to be built with the old shuffle vector, cast to a
  // v4i32 and extract_vector_elts
  if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
    SmallVector<int, 16> NewShuffleMask;
    for (int Part = 0; Part < 4; ++Part)
      for (int i = 0; i < QuarterSize; i++)
        NewShuffleMask.push_back(
            Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
    SDValue NewShuffle = DAG.getVectorShuffle(
        VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
    SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);

    for (int Part = 0; Part < 4; ++Part)
      if (!Parts[Part])
        Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
                                  BitCast, DAG.getConstant(Part, dl, MVT::i32));
  }
  // Build a vector out of the various parts and bitcast it back to the original
  // type.
  SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
  return DAG.getBitcast(VT, NewVec);
}
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
                                              ArrayRef<int> ShuffleMask,
                                              SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // A One-Off Identity mask is one that is mostly an identity mask from a
  // single source but contains a single element out-of-place, either from a
  // different vector or from another position in the same vector. As opposed
  // to lowering this via an ARMISD::BUILD_VECTOR we can generate an
  // extract/insert pair directly.
  auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
                                 int &OffElement) {
    OffElement = -1;
    int NonUndef = 0;
    for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
      if (Mask[i] == -1)
        continue;
      NonUndef++;
      if (Mask[i] != i + BaseOffset) {
        if (OffElement == -1)
          OffElement = i;
        else
          return false;
      }
    }
    return NonUndef > 2 && OffElement != -1;
  };

  int OffElement;

  SDValue VInput;
  if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
    VInput = V1;
  else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
    VInput = V2;
  else
    return SDValue();

  SDLoc dl(Op);
  EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
                ? MVT::i32
                : VT.getScalarType();
  SDValue Elt = DAG.getNode(
      ISD::EXTRACT_VECTOR_ELT, dl, SVT,
      ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
      DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
  return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
                     DAG.getVectorIdxConstant(OffElement % NumElts, dl));
}
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  unsigned EltSize = VT.getScalarSizeInBits();

  if (ST->hasMVEIntegerOps() && EltSize == 1)
    return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection.  This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same time so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it).
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        bool IsScalarToVector = true;
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (!V1.getOperand(i).isUndef()) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, dl, MVT::i32));
    }

    bool ReverseVEXT = false;
    unsigned Imm = 0;
    if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    if (ST->hasNEON() && V2->isUndef() &&
        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult = 0;
    bool isV_UNDEF = false;
    if (ST->hasNEON()) {
      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, VT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          V2 = V1;
        return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
            .getValue(WhichResult);
      }
    }
    if (ST->hasMVEIntegerOps()) {
      if (isVMOVNMask(ShuffleMask, VT, false, false))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
                           DAG.getConstant(0, dl, MVT::i32));
      if (isVMOVNMask(ShuffleMask, VT, true, false))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
                           DAG.getConstant(1, dl, MVT::i32));
      if (isVMOVNMask(ShuffleMask, VT, true, true))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
                           DAG.getConstant(1, dl, MVT::i32));
    }

    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
    // shuffles that produce a result larger than their operands with:
    //   shuffle(concat(v1, undef), concat(v2, undef))
    // ->
    //   shuffle(concat(v1, v2), undef)
    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
    //
    // This is useful in the general case, but there are special cases where
    // native shuffles produce larger results: the two-result ops.
    //
    // Look through the concat when lowering them:
    //   shuffle(concat(v1, v2), undef)
    // ->
    //   concat(VZIP(v1, v2):0, :1)
    //
    if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS &&
        V2->isUndef()) {
      SDValue SubV1 = V1->getOperand(0);
      SDValue SubV2 = V1->getOperand(1);
      EVT SubVT = SubV1.getValueType();

      // We expect these to have been canonicalized to -1.
      assert(llvm::all_of(ShuffleMask, [&](int i) {
        return i < (int)VT.getVectorNumElements();
      }) && "Unexpected shuffle index into UNDEF operand!");

      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          SubV2 = SubV1;
        assert((WhichResult == 0) &&
               "In-place shuffle of concat can only have one result!");
        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                  SubV1, SubV2);
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                           Res.getValue(1));
      }
    }
  }

  if (ST->hasMVEIntegerOps() && EltSize <= 32)
    if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
      return V;

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4) {
      if (ST->hasNEON())
        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      else if (isLegalMVEShuffleOp(PFEntry)) {
        unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
        unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
        unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
        unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
        if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
          return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      }
    }
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
      isReverseMask(ShuffleMask, VT))
    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

  if (ST->hasNEON() && VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  if (ST->hasMVEIntegerOps())
    if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
      return NewOp;

  return SDValue();
}
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                         const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
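  // The insert is done on the predicate bits: move the whole predicate into a
  // GPR, sign extend the new element into an all-ones/all-zeros field, BFI it
  // into the lane's slot and cast the result back to a predicate.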
  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
                            Op.getOperand(1), DAG.getValueType(MVT::i1));
  SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
                            DAG.getConstant(~Mask, dl, MVT::i32));
  return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
}
SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // INSERT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Elt = Op.getOperand(1);
  EVT EltVT = Elt.getValueType();

  if (Subtarget->hasMVEIntegerOps() &&
      Op.getValueType().getScalarSizeInBits() == 1)
    return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);

  if (getTypeAction(*DAG.getContext(), EltVT) ==
      TargetLowering::TypePromoteFloat) {
    // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
    // but the type system will try to do that if we don't intervene.
    // Reinterpret any such vector-element insertion as one with the
    // corresponding integer types.

    SDLoc dl(Op);

    EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
    assert(getTypeAction(*DAG.getContext(), IEltVT) !=
           TargetLowering::TypePromoteFloat);

    SDValue VecIn = Op.getOperand(0);
    EVT VecVT = VecIn.getValueType();
    EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
                                  VecVT.getVectorNumElements());

    SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
    SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
    SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
                                  IVecIn, IElt, Lane);
    return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
  }

  return Op;
}
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                          const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
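  // Read the lane by moving the whole predicate into a GPR and shifting the
  // lane's bits down to bit 0.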
  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
                              DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
  return Shift;
}
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
                                       const ARMSubtarget *ST) {
  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VT = Vec.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);

  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
    SDLoc dl(Op);
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  return Op;
}
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDLoc dl(Op);
  assert(Op.getValueType().getScalarSizeInBits() == 1 &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(isPowerOf2_32(Op.getNumOperands()) &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(ST->hasMVEIntegerOps() &&
         "CONCAT_VECTORS lowering only supported for MVE");

  auto ConcatPair = [&](SDValue V1, SDValue V2) {
    EVT Op1VT = V1.getValueType();
    EVT Op2VT = V2.getValueType();
    assert(Op1VT == Op2VT && "Operand types don't match!");
    EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());

    SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
    SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);

    // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
    // promoted to v8i16, etc.
    MVT ElType =
        getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
    unsigned NumElts = 2 * Op1VT.getVectorNumElements();

    // Extract the vector elements from Op1 and Op2 one by one and truncate them
    // to be the right size for the destination. For example, if Op1 is v4i1
    // then the promoted vector is v4i32. The result of concatenation gives a
    // v8i1, which when promoted is v8i16. That means each i32 element from Op1
    // needs truncating to i16 and inserting in the result.
    EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
    SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
    auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
      EVT NewVT = NewV.getValueType();
      EVT ConcatVT = ConVec.getValueType();
      for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
        SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
                                  DAG.getIntPtrConstant(i, dl));
        ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
                             DAG.getConstant(j, dl, MVT::i32));
      }
      return ConVec;
    };
    unsigned j = 0;
    ConVec = ExtractInto(NewV1, ConVec, j);
    ConVec = ExtractInto(NewV2, ConVec, j);

    // Now return the result of comparing the subvector with zero,
    // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
    return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
                       DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  };

  // Concat each pair of subvectors and pack into the lower half of the array.
  SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
  while (ConcatOps.size() > 1) {
    for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
      SDValue V1 = ConcatOps[I];
      SDValue V2 = ConcatOps[I + 1];
      ConcatOps[I / 2] = ConcatPair(V1, V2);
    }
    ConcatOps.resize(ConcatOps.size() / 2);
  }
  return ConcatOps[0];
}
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  EVT VT = Op->getValueType(0);
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerCONCAT_VECTORS_i1(Op, DAG, ST);

  // The only time a CONCAT_VECTORS operation can have legal types is when
  // two 64-bit vectors are concatenated to a 128-bit vector.
  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
         "unexpected CONCAT_VECTORS");
  SDLoc dl(Op);
  SDValue Val = DAG.getUNDEF(MVT::v2f64);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  if (!Op0.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
                      DAG.getIntPtrConstant(0, dl));
  if (!Op1.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
                      DAG.getIntPtrConstant(1, dl));
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
}
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();

  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom EXTRACT_SUBVECTOR lowering");
  assert(ST->hasMVEIntegerOps() &&
         "EXTRACT_SUBVECTOR lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);

  // We now have Op1 promoted to a vector of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  EVT SubVT = MVT::getVectorVT(ElType, NumElts);
  SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
                              DAG.getIntPtrConstant(i, dl));
    SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
                         DAG.getConstant(j, dl, MVT::i32));
  }

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
// Turn a truncate to a predicate (an i1 vector) into icmp(and(x, 1), 0).
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
                               const ARMSubtarget *ST) {
  assert(ST->hasMVEIntegerOps() && "Expected MVE!");
  EVT VT = N->getValueType(0);
  assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
         "Expected a vector i1 type!");
  SDValue Op = N->getOperand(0);
  EVT FromVT = Op.getValueType();
  SDLoc DL(N);

  SDValue And =
      DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
  return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
                     DAG.getCondCode(ISD::SETNE));
}
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT ToVT = N->getValueType(0);
  if (ToVT.getScalarType() == MVT::i1)
    return LowerTruncatei1(N, DAG, Subtarget);

  // MVE does not have a single instruction to perform the truncation of a
  // v4i32 into the lower half of a v8i16, in the same way that a NEON vmovn
  // would. Most of the instructions in MVE follow the 'Beats' system, where
  // moving values from different lanes is usually something that the
  // instructions avoid.
  //
  // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
  // which take the top/bottom half of a larger lane and extend it (or do the
  // opposite, truncating into the top/bottom lane from a larger lane). Note
  // that because of the way we widen lanes, a v4i16 is really a v4i32 using
  // the bottom 16 bits from each vector lane. This works really well with T/B
  // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
  // to move into the other half of the vector.
  //
  // But truncates and sext/zext are always going to be fairly common from
  // llvm. We have several options for how to deal with them:
  // - Wherever possible combine them into an instruction that makes them
  //   "free". This includes loads/stores, which can perform the trunc as part
  //   of the memory operation. Or certain shuffles that can be turned into
  //   VMOVN/VMOVL.
  // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
  //   trunc(mul(sext(a), sext(b))) may become
  //   VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which
  //   in this case can use VMULL). This is performed in the
  //   MVELaneInterleavingPass.
  // - Otherwise we have an option. By default we would expand the
  //   zext/sext/trunc into a series of lane extract/inserts going via GPR
  //   registers. One for each vector lane in the vector. This can obviously be
  //   very expensive.
  // - The other option is to use the fact that loads/stores can
  //   extend/truncate to turn a trunc into two truncating stack stores and a
  //   stack reload. This becomes 3 back-to-back memory operations, but at
  //   least that is less than all the insert/extracts.
  //
  // In order to do the last, we convert certain truncs into MVETRUNC, which
  // are either optimized where they can be, or eventually lowered into stack
  // stores/loads. This prevents us from splitting a v8i16 trunc into two
  // stores too early, where other instructions would be better, and stops us
  // from having to reconstruct multiple buildvector shuffles into
  // loads/stores.
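  //
  // As a rough illustration (the exact instructions depend on the subtarget
  // and later optimizations), a v8i32->v8i16 MVETRUNC that is not otherwise
  // optimized ends up as two narrowing stores of the v4i32 halves into a
  // stack slot, followed by a single v8i16 reload of that slot.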
  if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
    return SDValue();
  EVT FromVT = N->getOperand(0).getValueType();
  if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
    return SDValue();

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  SDLoc DL(N);
  return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
}
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
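  //
  // For example, a v16i8 -> v16i16 sext becomes an ARMISD::MVESEXT producing
  // two v8i16 halves, which are concatenated back into the v16i16 result
  // below.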
  EVT ToVT = N->getValueType(0);
  if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
    return SDValue();
  SDValue Op = N->getOperand(0);
  EVT FromVT = Op.getValueType();
  if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
    return SDValue();

  SDLoc DL(N);
  EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
  if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
    ExtVT = MVT::v8i16;

  unsigned Opcode =
      N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
  SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
  SDValue Ext1 = Ext.getValue(1);

  if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
    Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
    Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
}
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
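/// For example, the v4i32 constant vector <100, -2, 300, 7> counts as
/// sign-extended because every element fits in an i16.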
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;
    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    unsigned HiElt = 1 - LoElt;
    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt + 2));
    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt + 2));
    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
      return false;
    if (isSigned) {
      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
        return true;
    } else {
      if (Hi0->isNullValue() && Hi1->isNullValue())
        return true;
    }
    return false;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    return false;
  }

  return true;
}
/// isSignExtended - Check if a node is a vector value that is sign-extended
/// or a constant BUILD_VECTOR with sign-extended elements.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, true))
    return true;
  return false;
}
/// isZeroExtended - Check if a node is a vector value that is zero-extended
/// (or any-extended) or a constant BUILD_VECTOR with zero-extended elements.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
      ISD::isZEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, false))
    return true;
  return false;
}
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");

  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  switch (OrigSimpleTy) {
  default: llvm_unreachable("Unexpected Vector Type");
  case MVT::v2i8:
  case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
  }
}
/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
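/// For example, a v4i8 operand that was extended to a v4i32 is re-extended
/// here to a v4i16 (64 bits) so it can feed the VMULL directly.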
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
                                            const EVT &OrigTy, const EVT &ExtTy,
                                            unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(),
                       LD->getAlignment(), LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlignment(),
                        LD->getMemOperand()->getFlags());
}
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
/// the unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND ||
      N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }

  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C).
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      // Other vector multiplications are legal.
      return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
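  // Roughly, as an illustrative NEON sequence:
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is preferable to
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1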
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float.
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps. This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step. This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 sdiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
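
/// Lower ISD::ADDCARRY/ISD::SUBCARRY to ARMISD::ADDE/ARMISD::SUBE, converting
/// between the boolean carry used by the generic nodes and the flag-based
/// carry used by the ARM nodes (note that SUBE consumes a carry rather than a
/// borrow, hence the 1 - C adjustments below).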
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  EVT VT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  SDValue Carry = Op.getOperand(2);
  SDValue Result;
  SDLoc DL(Op);

  if (Op.getOpcode() == ISD::ADDCARRY) {
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the addition proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  } else {
    // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
    // have to invert the carry first.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the subtraction proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
    // But the carry returned by ARMISD::SUBE is not a borrow as expected
    // by ISD::SUBCARRY, so compute 1 - C.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
  }

  // Return both values.
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
}
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin());

  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // return values are passed via sret.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Pair of floats / doubles used to pass the result.
  Type *RetTy = StructType::get(ArgTy, ArgTy);
  auto &DL = DAG.getDataLayout();

  ArgListTy Args;
  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  SDValue SRet;
  if (ShouldUseSRet) {
    // Create stack object for sret.
    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
    const Align StackAlign = DL.getPrefTypeAlign(RetTy);
    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));

    ArgListEntry Entry;
    Entry.Node = SRet;
    Entry.Ty = RetTy->getPointerTo();
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Entry.IsSRet = true;
    Args.push_back(Entry);
    RetTy = Type::getVoidTy(*DAG.getContext());
  }

  ArgListEntry Entry;
  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC =
      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  CallingConv::ID CC = getLibcallCallingConv(LC);
  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setCallee(CC, RetTy, Callee, std::move(Args))
      .setDiscardResult(ShouldUseSRet);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  if (!ShouldUseSRet)
    return CallResult.first;

  SDValue LoadSin =
      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());

  // Address of cos field.
  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  SDValue LoadCos =
      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());

  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
                     LoadSin.getValue(0), LoadCos.getValue(0));
}
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  const char *Name = nullptr;
  if (Signed)
    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
  else
    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";

  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));

  ARMTargetLowering::ArgListTy Args;

  for (auto AI : {1, 0}) {
    ArgListEntry Arg;
    Arg.Node = Op.getOperand(AI);
    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
    Args.push_back(Arg);
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
                 ES, std::move(Args));

  return LowerCallTo(CLI).first;
}
// This is a code size optimisation: return the original SDIV node to
// DAGCombiner when we don't want to expand SDIV into a sequence of
// instructions, and an empty node otherwise which will cause the
// SDIV to be expanded in DAGCombine.
SDValue
ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  // TODO: Support SREM
  if (N->getOpcode() != ISD::SDIV)
    return SDValue();

  const auto &ST = static_cast<const ARMSubtarget &>(DAG.getSubtarget());
  const bool MinSize = ST.hasMinSize();
  const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
                                      : ST.hasDivideInARMMode();

  // Don't touch vector types; rewriting this may lead to scalarizing
  // the int divs.
  if (N->getOperand(0).getValueType().isVector())
    return SDValue();

  // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
  // hwdiv support for this to be really profitable.
  if (!(MinSize && HasDivide))
    return SDValue();

  // ARM mode is a bit simpler than Thumb: we can handle large power
  // of 2 immediates with 1 mov instruction; no further checks required,
  // just return the sdiv node.
  if (!ST.isThumb())
    return SDValue(N, 0);

  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
  // and thus lose the code size benefits of a MOVS that requires only 2.
  // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
  // but as it's doing exactly this, it's not worth the trouble to get TTI.
  if (Divisor.sgt(128))
    return SDValue();

  return SDValue(N, 0);
}
SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                            bool Signed) const {
  assert(Op.getValueType() == MVT::i32 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
                               DAG.getEntryNode(), Op.getOperand(1));

  return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
}
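
// Emit an ARMISD::WIN__DBZCHK on the divisor so that a divide-by-zero check
// precedes the division libcall; for an i64 divisor the two 32-bit halves are
// OR'd together first so the check sees a single i32 value.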
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N,
                                      SDValue InChain) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(1);
  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(1, DL, MVT::i32));
  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
}
void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
    SmallVectorImpl<SDValue> &Results) const {
  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
}
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected a unindexed load");

  // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that 8/4 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
  // for BE).
  // Speaking of BE, apparently the rest of llvm will assume a reverse order to
  // a natural VMSR(load), so needs to be reversed.
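  //
  // For example, a v4i1 load is performed as an i4 extending load into an
  // i32, cast to a v16i1 with ARMISD::PREDICATE_CAST, and the low v4i1 lanes
  // are then extracted as the result.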
  SDLoc dl(Op);
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      LD->getMemOperand());
  SDValue Val = Load;
  if (DAG.getDataLayout().isBigEndian())
    Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
                      DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}
void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) const {
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();
  assert(LD->isUnindexed() && "Loads should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && LD->isVolatile()) {
    SDLoc dl(N);
    SDValue Result = DAG.getMemIntrinsicNode(
        ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
        {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
    SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
    SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
    Results.append({Pair, Result.getValue(2)});
  }
}
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-extending store");
  assert(ST->isUnindexed() && "Expected a unindexed store");

  // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
  // unset and a scalar store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
      unsigned Elt = DAG.getDataLayout().isBigEndian()
                         ? MemVT.getVectorNumElements() - I - 1
                         : I;
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(Elt, dl, MVT::i32)));
    }
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
    GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
                      DAG.getConstant(16, dl, MVT::i32));
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      ST->getMemOperand());
}
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
                          const ARMSubtarget *Subtarget) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert(ST->isUnindexed() && "Stores should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && ST->isVolatile()) {
    SDNode *N = Op.getNode();
    SDLoc dl(N);

    SDValue Lo = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
                              MVT::i32));
    SDValue Hi = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
                              MVT::i32));

    return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
                                   {ST->getChain(), Lo, Hi, ST->getBasePtr()},
                                   MemVT, ST->getMemOperand());
  } else if (Subtarget->hasMVEIntegerOps() &&
             ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
               MemVT == MVT::v16i1))) {
    return LowerPredicateStore(Op, DAG);
  }

  return SDValue();
}
static bool isZeroVector(SDValue N) {
  return (ISD::isBuildVectorAllZeros(N.getNode()) ||
          (N->getOpcode() == ARMISD::VMOVIMM &&
           isNullConstant(N->getOperand(0))));
}
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  if (isZeroVector(PassThru))
    return Op;

  // MVE Masked loads use zero as the passthru value. Here we convert undef to
  // zero too, and other values are lowered to a select.
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
      N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
      N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
                             PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
                            isZeroVector(PassThru->getOperand(0));
  if (!PassThru.isUndef() && !PassThruIsCastZero)
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
                              const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();

  SDLoc dl(Op);
  unsigned BaseOpcode = 0;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Expected VECREDUCE opcode");
  case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
  case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
  case ISD::VECREDUCE_MUL:  BaseOpcode = ISD::MUL; break;
  case ISD::VECREDUCE_AND:  BaseOpcode = ISD::AND; break;
  case ISD::VECREDUCE_OR:   BaseOpcode = ISD::OR; break;
  case ISD::VECREDUCE_XOR:  BaseOpcode = ISD::XOR; break;
  case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
  case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
  }

  SDValue Op0 = Op->getOperand(0);
  EVT VT = Op0.getValueType();
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumActiveLanes = NumElts;

  assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
          NumActiveLanes == 2) &&
         "Only expected a power 2 vector size");

  // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
  // allows us to easily extract vector elements from the lanes.
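  // For example, for a v8i16 reduction the first iteration computes
  // X <op> VREV32(X), combining lanes pairwise and halving the number of
  // "active" lanes from 8 to 4; the remaining 4 lanes are then combined with
  // scalar extracts below.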
  while (NumActiveLanes > 4) {
    unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
    SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
    Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
    NumActiveLanes /= 2;
  }

  SDValue Res;
  if (NumActiveLanes == 4) {
    // The remaining 4 elements are summed sequentially
    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
                               DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
                               DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
                               DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
    SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
                               DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
    SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
    SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
    Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
  } else {
    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
                               DAG.getConstant(0, dl, MVT::i32));
    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
                               DAG.getConstant(1, dl, MVT::i32));
    Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
  }

  // Result type may be wider than element type.
  if (EltVT != Op->getValueType(0))
    Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
  return Res;
}
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) {
  if (!ST->hasMVEFloatOps())
    return SDValue();
  return LowerVecReduce(Op, DAG, ST);
}
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
    // Acquire/Release load/store is not legal for targets without a dmb or
    // equivalent available.
    return SDValue();

  // Monotonic load/store is legal for all targets.
  return Op;
}
static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
  SDLoc DL(N);
  // Under Power Management extensions, the cycle-count is:
  //   mrc p15, #0, <Rt>, c9, c13, #0
  SDValue Ops[] = { N->getOperand(0), // Chain
                    DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                    DAG.getTargetConstant(15, DL, MVT::i32),
                    DAG.getTargetConstant(0, DL, MVT::i32),
                    DAG.getTargetConstant(9, DL, MVT::i32),
                    DAG.getTargetConstant(13, DL, MVT::i32),
                    DAG.getTargetConstant(0, DL, MVT::i32)
  };

  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other), Ops);
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
                                DAG.getConstant(0, DL, MVT::i32)));
  Results.push_back(Cycles32.getValue(1));
}
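
// Build an untyped GPRPair (a REG_SEQUENCE of gsub_0/gsub_1) from an i64
// value, swapping the two halves on big-endian targets. Used below to feed
// CMP_SWAP_64, which expects its i64 operands in paired registers.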
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());
  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
  SDValue VHi = DAG.getAnyExtOrTrunc(
      DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
      dl, MVT::i32);
  bool isBigEndian = DAG.getDataLayout().isBigEndian();
  if (isBigEndian)
    std::swap(VLo, VHi);
  SDValue RegClass =
      DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                      SmallVectorImpl<SDValue> &Results,
                                      SelectionDAG &DAG) {
  assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types less than 64 should be legal");
  SDValue Ops[] = {N->getOperand(1),
                   createGPRPairNode(DAG, N->getOperand(2)),
                   createGPRPairNode(DAG, N->getOperand(3)),
                   N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      ARM::CMP_SWAP_64, SDLoc(N),
      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);

  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  bool isBigEndian = DAG.getDataLayout().isBigEndian();

  SDValue Lo =
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  SDValue Hi =
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
  Results.push_back(SDValue(CmpSwap, 2));
}
SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;

  // If we don't have instructions of this float type then soften to a libcall
  // and use SETCC instead.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
    SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
                                 DAG.getCondCode(CC));
    return DAG.getMergeValues({Result, Chain}, dl);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
  // in CMPFP and CMPFPE, but instead it should be made explicit by these
  // instructions using a chain instead of glue. This would also fix the problem
  // here (and also in LowerSELECT_CC) where we generate two comparisons when
  // CondCode2 != AL.
  SDValue True = DAG.getConstant(1, dl, VT);
  SDValue False = DAG.getConstant(0, dl, VT);
  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
    Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
  }
  return DAG.getMergeValues({Result, Chain}, dl);
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Don't know how to custom lower this!");
  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::BR_JT: return LowerBR_JT(Op, DAG);
  case ISD::VASTART: return LowerVASTART(Op, DAG);
  case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                               Subtarget);
  case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
  case ISD::SREM: return LowerREM(Op.getNode(), DAG);
  case ISD::UREM: return LowerREM(Op.getNode(), DAG);
  case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
  case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
  case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
  case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
  case ISD::MUL: return LowerMUL(Op, DAG);
  case ISD::SDIV:
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
    return LowerSDIV(Op, DAG, Subtarget);
  case ISD::UDIV:
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
    return LowerUDIV(Op, DAG, Subtarget);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  case ISD::SADDO:
  case ISD::SSUBO:
    return LowerSignedALUO(Op, DAG);
  case ISD::UADDO:
  case ISD::USUBO:
    return LowerUnsignedALUO(Op, DAG);
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
  case ISD::UADDSAT:
  case ISD::USUBSAT:
    return LowerADDSUBSAT(Op, DAG, Subtarget);
  case ISD::LOAD:
    return LowerPredicateLoad(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG, Subtarget);
  case ISD::MLOAD:
    return LowerMLOAD(Op, DAG);
  case ISD::VECREDUCE_MUL:
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
    return LowerVecReduce(Op, DAG, Subtarget);
  case ISD::VECREDUCE_FADD:
  case ISD::VECREDUCE_FMUL:
  case ISD::VECREDUCE_FMIN:
  case ISD::VECREDUCE_FMAX:
    return LowerVecReduceF(Op, DAG, Subtarget);
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
  case ISD::SDIVREM:
  case ISD::UDIVREM: return LowerDivRem(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    if (Subtarget->isTargetWindows())
      return LowerDYNAMIC_STACKALLOC(Op, DAG);
    llvm_unreachable("Don't know how to custom lower this!");
  case ISD::STRICT_FP_ROUND:
  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::STRICT_FP_EXTEND:
  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
  case ARMISD::WIN__DBZCHK: return SDValue();
  }
}
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opc = 0;
  if (IntNo == Intrinsic::arm_smlald)
    Opc = ARMISD::SMLALD;
  else if (IntNo == Intrinsic::arm_smlaldx)
    Opc = ARMISD::SMLALDX;
  else if (IntNo == Intrinsic::arm_smlsld)
    Opc = ARMISD::SMLSLD;
  else if (IntNo == Intrinsic::arm_smlsldx)
    Opc = ARMISD::SMLSLDX;
  else
    return;

  SDLoc dl(N);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(0, dl, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(1, dl, MVT::i32));

  SDValue LongMul = DAG.getNode(Opc, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                N->getOperand(1), N->getOperand(2),
                                Lo, Hi);
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                LongMul.getValue(0), LongMul.getValue(1)));
}
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDValue Res;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this!");
  case ISD::READ_REGISTER:
    ExpandREAD_REGISTER(N, Results, DAG);
    break;
  case ISD::BITCAST:
    Res = ExpandBITCAST(N, DAG, Subtarget);
    break;
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    Res = Expand64BitShift(N, DAG, Subtarget);
    break;
  case ISD::SREM:
  case ISD::UREM:
    Res = LowerREM(N, DAG);
    break;
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    Res = LowerDivRem(SDValue(N, 0), DAG);
    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
    Results.push_back(Res.getValue(0));
    Results.push_back(Res.getValue(1));
    return;
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
  case ISD::UADDSAT:
  case ISD::USUBSAT:
    Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
    break;
  case ISD::READCYCLECOUNTER:
    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
    return;
  case ISD::UDIV:
  case ISD::SDIV:
    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
                             Results);
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_64Results(N, Results, DAG);
    return;
  case ISD::INTRINSIC_WO_CHAIN:
    return ReplaceLongIntrinsic(N, Results, DAG);
  case ISD::ABS:
    lowerABS(N, Results, DAG);
    return;
  case ISD::LOAD:
    LowerLOAD(N, Results, DAG);
    break;
  case ISD::TRUNCATE:
    Res = LowerTruncate(N, DAG, Subtarget);
    break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    Res = LowerVectorExtend(N, DAG, Subtarget);
    break;
  }
  if (Res.getNode())
    Results.push_back(Res);
}
//===----------------------------------------------------------------------===//
//                           ARM Scheduler Hooks
//===----------------------------------------------------------------------===//
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported with SjLj");
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineConstantPool *MCP = MF->getConstantPool();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  const Function &F = MF->getFunction();

  bool isThumb = Subtarget->isThumb();
  bool isThumb2 = Subtarget->isThumb2();

  unsigned PCLabelId = AFI->createPICLabelUId();
  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  ARMConstantPoolValue *CPV =
      ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
  unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));

  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;

  // Grab constant pool and fixed stack memory operands.
  MachineMemOperand *CPMMO =
      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                               MachineMemOperand::MOLoad, 4, Align(4));

  MachineMemOperand *FIMMOSt =
      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
                               MachineMemOperand::MOStore, 4, Align(4));

  // Load the address of the dispatch MBB into the jump buffer.
  if (isThumb2) {
    // Incoming value: jbuf
    //   ldr.n  r5, LCPI1_1
    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    // Set the low bit because of thumb mode.
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(0x01)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
        .addReg(NewVReg2, RegState::Kill)
        .addImm(PCLabelId);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
        .addReg(NewVReg3, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else if (isThumb) {
    // Incoming value: jbuf
    //   ldr.n  r1, LCPI1_4
    //   add    r2, $jbuf, #+4 ; &jbuf[1]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId);
    // Set the low bit because of thumb mode.
    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
        .addReg(ARM::CPSR, RegState::Define)
        .addImm(1)
        .add(predOps(ARMCC::AL));
    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3, RegState::Kill)
        .add(predOps(ARMCC::AL));
    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
        .addFrameIndex(FI)
        .addImm(36); // &jbuf[1] :: pc
    BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg5, RegState::Kill)
        .addImm(0)
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else {
    // Incoming value: jbuf
    //   str    r1, [$jbuf, #+4] ; &jbuf[1]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addImm(0)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId)
        .add(predOps(ARMCC::AL));
    BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
        .addReg(NewVReg2, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  }
}

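/// EmitSjLjDispatchBlock - Build the SjLj exception dispatch table for this
/// function: the dispatch block loads the call-site index saved in the
/// function context, bounds-checks it against the number of landing pads
/// (branching to a trap block on overflow), and jumps through an inline jump
/// table to the matching landing pad.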
void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int FI = MFI.getFunctionContextIndex();

  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
                                                        : &ARM::GPRnopcRegClass;

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
       ++BB) {
    if (!BB->isEHPad()) continue;

    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
    // pad.
    for (MachineBasicBlock::iterator
           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
      if (!II->isEHLabel()) continue;

      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
      if (!MF->hasCallSiteLandingPad(Sym)) continue;

      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
      for (SmallVectorImpl<unsigned>::iterator
             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
           CSI != CSE; ++CSI) {
        CallSiteNumToLPad[*CSI].push_back(&*BB);
        MaxCSNum = std::max(MaxCSNum, *CSI);
      }
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  std::vector<MachineBasicBlock*> LPadList;
  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
    for (SmallVectorImpl<MachineBasicBlock*>::iterator
           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
      LPadList.push_back(*II);
      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert the MBBs.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers.  This results in all
  // registers being marked as clobbered. This can't work if the dispatch block
  // is in a Thumb1 function and is linked with ARM code which uses the FP
  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
  MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));

  bool IsPositionIndependent = isPositionIndependent();
  unsigned NumLPads = LPadList.size();
  if (Subtarget->isThumb2()) {
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
          .addReg(NewVReg1)
          .addImm(LPadList.size())
          .add(predOps(ARMCC::AL));
    } else {
      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg1)
        .addJumpTableIndex(MJTI);
  } else if (Subtarget->isThumb()) {
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
        .addFrameIndex(FI)
        .addImm(1)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else {
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);

      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
          .addReg(NewVReg1)
          .addReg(VReg1)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg1)
        .addImm(2)
        .add(predOps(ARMCC::AL));

    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd =
        MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
                                 MachineMemOperand::MOLoad, 4, Align(4));

    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
        .addReg(NewVReg4, RegState::Kill)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg6 = NewVReg5;
    if (IsPositionIndependent) {
      NewVReg6 = MRI->createVirtualRegister(TRC);
      BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
          .addReg(ARM::CPSR, RegState::Define)
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg3)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
        .addReg(NewVReg6, RegState::Kill)
        .addJumpTableIndex(MJTI);
  } else {
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    } else {
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);

      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg1, RegState::Kill)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd =
        MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
                                 MachineMemOperand::MOLoad, 4, Align(4));
    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg4)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    if (IsPositionIndependent) {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg4)
          .addJumpTableIndex(MJTI);
    } else {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
          .addReg(NewVReg5, RegState::Kill)
          .addJumpTableIndex(MJTI);
    }
  }
  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
  for (std::vector<MachineBasicBlock*>::iterator
         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
    MachineBasicBlock *CurMBB = *I;
    if (SeenMBBs.insert(CurMBB).second)
      DispContBB->addSuccessor(CurMBB);
  }

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  SmallVector<MachineBasicBlock*, 64> MBBLPads;
  for (MachineBasicBlock *BB : InvokeBBs) {
    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
    while (!Successors.empty()) {
      MachineBasicBlock *SMBB = Successors.pop_back_val();
      if (SMBB->isEHPad()) {
        BB->removeSuccessor(SMBB);
        MBBLPads.push_back(SMBB);
      }
    }

    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
    BB->normalizeSuccProbs();

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (MachineBasicBlock::reverse_iterator
           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!II->isCall()) continue;

      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator
             OI = II->operands_begin(), OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg()) continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);

      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        if (Subtarget->isThumb2() &&
            !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (SmallVectorImpl<MachineBasicBlock*>::iterator
         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
    (*I)->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
}

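// Return the successor of MBB that is not Succ. The caller (the BCCZi64
// expansion in EmitInstrWithCustomInserter below) guarantees that MBB has
// exactly two successors.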
static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB,
                                    MachineBasicBlock *Succ) {
  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
                                        E = MBB->succ_end(); I != E; ++I)
    if (*I != Succ)
      return *I;
  llvm_unreachable("Expecting a BB with two successors!");
}

/// Return the load opcode for a given load size. If load size >= 8,
/// neon opcode will be returned.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  if (LdSize >= 8)
    return LdSize == 16 ? ARM::VLD1q32wb_fixed
                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  if (IsThumb1)
    return LdSize == 4 ? ARM::tLDRi
                       : LdSize == 2 ? ARM::tLDRHi
                                     : LdSize == 1 ? ARM::tLDRBi : 0;
  if (IsThumb2)
    return LdSize == 4 ? ARM::t2LDR_POST
                       : LdSize == 2 ? ARM::t2LDRH_POST
                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  return LdSize == 4 ? ARM::LDR_POST_IMM
                     : LdSize == 2 ? ARM::LDRH_POST
                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
}

/// Return the store opcode for a given store size. If store size >= 8,
/// neon opcode will be returned.
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  if (StSize >= 8)
    return StSize == 16 ? ARM::VST1q32wb_fixed
                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  if (IsThumb1)
    return StSize == 4 ? ARM::tSTRi
                       : StSize == 2 ? ARM::tSTRHi
                                     : StSize == 1 ? ARM::tSTRBi : 0;
  if (IsThumb2)
    return StSize == 4 ? ARM::t2STR_POST
                       : StSize == 2 ? ARM::t2STRH_POST
                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
  return StSize == 4 ? ARM::STR_POST_IMM
                     : StSize == 2 ? ARM::STRH_POST
                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
}

/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // load + update AddrIn
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  }
}

/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(StOpc != 0 && "Should have a store opcode");
  if (StSize >= 8) {
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(AddrIn)
        .addImm(0)
        .addReg(Data)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // store + update AddrIn
    BuildMI(*BB, Pos, dl, TII->get(StOpc))
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  }
}

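// Together, emitPostLd and emitPostSt implement one post-incrementing copy
// step, roughly:
//   [scratch, srcOut] = LDR_POST(srcIn, #UnitSize)
//   [destOut]         = STR_POST(scratch, destIn, #UnitSize)
// EmitStructByval below strings these steps together, either fully unrolled
// or inside a loop.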
MachineBasicBlock *
ARMTargetLowering::EmitStructByval(MachineInstr &MI,
                                   MachineBasicBlock *BB) const {
  // This pseudo instruction has 3 operands: dst, src, size
  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  // Otherwise, we will generate unrolled scalar copies.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register src = MI.getOperand(1).getReg();
  unsigned SizeVal = MI.getOperand(2).getImm();
  unsigned Alignment = MI.getOperand(3).getImm();
  DebugLoc dl = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass *VecTRC = nullptr;

  bool IsThumb1 = Subtarget->isThumb1Only();
  bool IsThumb2 = Subtarget->isThumb2();
  bool IsThumb = Subtarget->isThumb();

  if (Alignment & 1) {
    UnitSize = 1;
  } else if (Alignment & 2) {
    UnitSize = 2;
  } else {
    // Check whether we can use NEON instructions.
    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
        Subtarget->hasNEON()) {
      if ((Alignment % 16 == 0) && SizeVal >= 16)
        UnitSize = 16;
      else if ((Alignment % 8 == 0) && SizeVal >= 8)
        UnitSize = 8;
    }
    // Can't use NEON instructions.
    if (UnitSize == 0)
      UnitSize = 4;
  }

  // Select the correct opcode and register class for unit size load/store
  bool IsNeon = UnitSize >= 8;
  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  if (IsNeon)
    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
                            : UnitSize == 8 ? &ARM::DPRRegClass
                                            : nullptr;

  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;

  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
    // Use LDR and STR to copy.
    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
    // [destOut] = STR_POST(scratch, destIn, UnitSize)
    unsigned srcIn = src;
    unsigned destIn = dest;
    for (unsigned i = 0; i < LoopSize; i += UnitSize) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    // Handle the leftover bytes with LDRB and STRB.
    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    // [destOut] = STRB_POST(scratch, destIn, 1)
    for (unsigned i = 0; i < BytesLeft; i++) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(TRC);
      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    MI.eraseFromParent(); // The instruction is gone now.
    return BB;
  }

  // Expand the pseudo op to a loop.
  //
  //   movw varEnd, #          --> with thumb2
  //   movt varEnd, #
  //   ldrcp varEnd, idx       --> without thumb2
  //   fallthrough --> loopMBB
  // loopMBB:
  //   PHI varPhi, varEnd, varLoop
  //   PHI srcPhi, src, srcLoop
  //   PHI destPhi, dst, destLoop
  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  //   subs varLoop, varPhi, #UnitSize
  //   bne loopMBB
  //   fallthrough --> exitMBB
  // exitMBB:
  //   epilogue to handle left-over bytes
  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Load an immediate to varEnd.
  Register varEnd = MRI.createVirtualRegister(TRC);
  if (Subtarget->useMovt()) {
    unsigned Vtmp = varEnd;
    if ((LoopSize & 0xFFFF0000) != 0)
      Vtmp = MRI.createVirtualRegister(TRC);
    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
        .addImm(LoopSize & 0xFFFF)
        .add(predOps(ARMCC::AL));

    if ((LoopSize & 0xFFFF0000) != 0)
      BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
          .addReg(Vtmp)
          .addImm(LoopSize >> 16)
          .add(predOps(ARMCC::AL));
  } else {
    MachineConstantPool *ConstantPool = MF->getConstantPool();
    Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

    // MachineConstantPool wants an explicit alignment.
    Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
    MachineMemOperand *CPMMO =
        MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                                 MachineMemOperand::MOLoad, 4, Align(4));

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  Register varLoop = MRI.createVirtualRegister(TRC);
  Register varPhi = MRI.createVirtualRegister(TRC);
  Register srcLoop = MRI.createVirtualRegister(TRC);
  Register srcPhi = MRI.createVirtualRegister(TRC);
  Register destLoop = MRI.createVirtualRegister(TRC);
  Register destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
      .addReg(varLoop).addMBB(loopMBB)
      .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
      .addReg(srcLoop).addMBB(loopMBB)
      .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
      .addReg(destLoop).addMBB(loopMBB)
      .addReg(dest).addMBB(entryBB);

  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL));
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    Register srcOut = MRI.createVirtualRegister(TRC);
    Register destOut = MRI.createVirtualRegister(TRC);
    Register scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent(); // The instruction is gone now.
  return BB;
}

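/// Lower WIN__CHKSTK: call the Windows __chkstk helper, which takes the
/// number of words to allocate in R4 and returns the stack adjustment in
/// bytes in R4, then subtract that byte count from SP.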
MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4. This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it. Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required. As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.

  switch (TM.getCodeModel()) {
  case CodeModel::Tiny:
    llvm_unreachable("Tiny code model not available on ARM.");
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Kernel:
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
        .add(predOps(ARMCC::AL))
        .addExternalSymbol("__chkstk")
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  case CodeModel::Large: {
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
        .addExternalSymbol("__chkstk");
    BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))
        .add(predOps(ARMCC::AL))
        .addReg(Reg, RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  }
  }

  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
      .addReg(ARM::SP, RegState::Kill)
      .addReg(ARM::R4, RegState::Kill)
      .setMIFlags(MachineInstr::FrameSetup)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  MI.eraseFromParent();
  return MBB;
}

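/// Lower WIN__DBZCHK: compare the divisor against zero and branch to a
/// __brkdiv0 trap block when it is zero; otherwise fall through to the
/// continuation block.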
MachineBasicBlock *
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  MF->insert(++MBB->getIterator(), ContBB);
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContBB);

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  MF->push_back(TrapBB);
  MBB->addSuccessor(TrapBB);

  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
      .addReg(MI.getOperand(0).getReg())
      .addImm(0)
      .add(predOps(ARMCC::AL));
  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::EQ)
      .addReg(ARM::CPSR);

  MI.eraseFromParent();
  return ContBB;
}

// The CPSR operand of SelectItr might be missing a kill marker
// because there were multiple uses of CPSR, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
                                   MachineBasicBlock* BB,
                                   const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of CPSR.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(ARM::CPSR))
      return false;
    if (mi.definesRegister(ARM::CPSR))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether CPSR is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(ARM::CPSR))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and CPSR wasn't live
  // out. SelectMI should have a kill flag on CPSR.
  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
  return true;
}

/// Adds logic in loop entry MBB to calculate loop iteration count and adds
/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
static Register genTPEntry(MachineBasicBlock *TpEntry,
                           MachineBasicBlock *TpLoopBody,
                           MachineBasicBlock *TpExit, Register OpSizeReg,
                           const TargetInstrInfo *TII, DebugLoc Dl,
                           MachineRegisterInfo &MRI) {
  // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
      .addUse(OpSizeReg)
      .addImm(15)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
      .addUse(AddDestReg, RegState::Kill)
      .addImm(4)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
      .addUse(LsrDestReg, RegState::Kill);

  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
      .addUse(TotalIterationsReg)
      .addMBB(TpExit);

  BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
      .addMBB(TpLoopBody)
      .add(predOps(ARMCC::AL));

  return TotalIterationsReg;
}

/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
/// t2DoLoopEnd. These are used by later passes to generate tail predicated
/// loops.
static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
                          MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
                          const TargetInstrInfo *TII, DebugLoc Dl,
                          MachineRegisterInfo &MRI, Register OpSrcReg,
                          Register OpDestReg, Register ElementCountReg,
                          Register TotalIterationsReg, bool IsMemcpy) {
  // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
  // array, loop iteration counter, predication counter.

  Register SrcPhiReg, CurrSrcReg;
  if (IsMemcpy) {
    // Current position in the src array
    SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
    CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
    BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
        .addUse(OpSrcReg)
        .addMBB(TpEntry)
        .addUse(CurrSrcReg)
        .addMBB(TpLoopBody);
  }

  // Current position in the dest array
  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
      .addUse(OpDestReg)
      .addMBB(TpEntry)
      .addUse(CurrDestReg)
      .addMBB(TpLoopBody);

  // Current loop counter
  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  Register RemainingLoopIterationsReg =
      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
      .addUse(TotalIterationsReg)
      .addMBB(TpEntry)
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Predication counter
  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
      .addUse(ElementCountReg)
      .addMBB(TpEntry)
      .addUse(RemainingElementsReg)
      .addMBB(TpLoopBody);

  // Pass predication counter to VCTP
  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
      .addUse(PredCounterPhiReg)
      .addImm(ARMVCC::None)
      .addReg(0);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
      .addUse(PredCounterPhiReg)
      .addImm(16)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
  Register SrcValueReg;
  if (IsMemcpy) {
    SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
    BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
        .addDef(CurrSrcReg)
        .addDef(SrcValueReg)
        .addReg(SrcPhiReg)
        .addImm(16)
        .addImm(ARMVCC::Then)
        .addUse(VccrReg);
  } else
    SrcValueReg = OpSrcReg;

  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
      .addDef(CurrDestReg)
      .addUse(SrcValueReg)
      .addReg(DestPhiReg)
      .addImm(16)
      .addImm(ARMVCC::Then)
      .addUse(VccrReg);

  // Add the pseudoInstrs for decrementing the loop counter and marking the
  // end: t2DoLoopDec and t2DoLoopEnd
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
      .addUse(LoopCounterPhiReg)
      .addImm(1);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
      .addMBB(TpExit)
      .add(predOps(ARMCC::AL));
}

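// genTPEntry and genTPLoopBody together build the skeleton of a tail
// predicated loop (which later passes lower to WLSTP/LETP style hardware
// loops); EmitInstrWithCustomInserter uses them below to expand the
// MVE_MEMCPYLOOPINST and MVE_MEMSETLOOPINST pseudos.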
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected instr type to insert");

  // Thumb1 post-indexed loads are really just single-register LDMs.
  case ARM::tLDR_postidx: {
    MachineOperand Def(MI.getOperand(1));
    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
        .add(Def)                // Rn_wb
        .add(MI.getOperand(2))   // Rn
        .add(MI.getOperand(3))   // PredImm
        .add(MI.getOperand(4))   // PredReg
        .add(MI.getOperand(0))   // Rt
        .cloneMemRefs(MI);
    MI.eraseFromParent();
    return BB;
  }

  case ARM::MVE_MEMCPYLOOPINST:
  case ARM::MVE_MEMSETLOOPINST: {

    // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
    // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
    // the iteration count = ceil(size_in_bytes/16) in the TP entry block and
    // adds the relevant instructions in the TP loop Body for generation of a
    // WLSTP loop.
    //
    // Below is relevant portion of the CFG after the transformation.
    // The Machine Basic Blocks are shown along with branch conditions (in
    // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
    // portion of the CFG and may not necessarily be the entry/exit of the
    // function.
    //
    //          (Relevant) CFG after transformation:
    //
    //                   TP entry MBB
    //                        |
    //             |-----------------|
    //          (n <= 0)          (n > 0)
    //             |                 |
    //             |        TP loop Body MBB<--|
    //             |                 |         |
    //             |                 |---------|
    //             |                 |
    //             |-------> TP exit MBB

    MachineFunction *MF = BB->getParent();
    MachineFunctionProperties &Properties = MF->getProperties();
    MachineRegisterInfo &MRI = MF->getRegInfo();

    Register OpDestReg = MI.getOperand(0).getReg();
    Register OpSrcReg = MI.getOperand(1).getReg();
    Register OpSizeReg = MI.getOperand(2).getReg();

    // Allocate the required MBBs and add to parent function.
    MachineBasicBlock *TpEntry = BB;
    MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
    MachineBasicBlock *TpExit;

    MF->push_back(TpLoopBody);

    // If any instructions are present in the current block after
    // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
    // move the instructions into the newly created exit block. If there are no
    // instructions add an explicit branch to the FallThrough block and then
    // split.
    //
    // The split is required for two reasons:
    // 1) A terminator(t2WhileLoopStart) will be placed at that site.
    // 2) Since a TPLoopBody will be added later, any phis in successive blocks
    //    need to be updated. splitAt() already handles this.
    TpExit = BB->splitAt(MI, false);
    if (TpExit == BB) {
      assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
                                     "block containing memcpy/memset Pseudo");
      TpExit = BB->getFallThrough();
      BuildMI(BB, dl, TII->get(ARM::t2B))
          .addMBB(TpExit)
          .add(predOps(ARMCC::AL));
      TpExit = BB->splitAt(MI, false);
    }

    // Add logic for iteration count
    Register TotalIterationsReg =
        genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);

    // Add the vectorized (and predicated) loads/store instructions
    bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
    genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
                  OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);

    // Required to avoid conflict with the MachineVerifier during testing.
    Properties.reset(MachineFunctionProperties::Property::NoPHIs);

    // Connect the blocks
    TpEntry->addSuccessor(TpLoopBody);
    TpLoopBody->addSuccessor(TpLoopBody);
    TpLoopBody->addSuccessor(TpExit);

    // Reorder for a more natural layout
    TpLoopBody->moveAfter(TpEntry);
    TpExit->moveAfter(TpLoopBody);

    // Finally, remove the memcpy Pseudo Instruction
    MI.eraseFromParent();

    // Return the exit block as it may contain other instructions requiring a
    // custom inserter.
    return TpExit;
  }

11676 // define them differently in the .td files from the isel patterns, so
11677 // they need pseudos.
11678 case ARM::t2STR_preidx
:
11679 MI
.setDesc(TII
->get(ARM::t2STR_PRE
));
11681 case ARM::t2STRB_preidx
:
11682 MI
.setDesc(TII
->get(ARM::t2STRB_PRE
));
11684 case ARM::t2STRH_preidx
:
11685 MI
.setDesc(TII
->get(ARM::t2STRH_PRE
));
11688 case ARM::STRi_preidx
:
11689 case ARM::STRBi_preidx
: {
11690 unsigned NewOpc
= MI
.getOpcode() == ARM::STRi_preidx
? ARM::STR_PRE_IMM
11691 : ARM::STRB_PRE_IMM
;
11692 // Decode the offset.
11693 unsigned Offset
= MI
.getOperand(4).getImm();
11694 bool isSub
= ARM_AM::getAM2Op(Offset
) == ARM_AM::sub
;
11695 Offset
= ARM_AM::getAM2Offset(Offset
);
11699 MachineMemOperand
*MMO
= *MI
.memoperands_begin();
11700 BuildMI(*BB
, MI
, dl
, TII
->get(NewOpc
))
11701 .add(MI
.getOperand(0)) // Rn_wb
11702 .add(MI
.getOperand(1)) // Rt
11703 .add(MI
.getOperand(2)) // Rn
11704 .addImm(Offset
) // offset (skip GPR==zero_reg)
11705 .add(MI
.getOperand(5)) // pred
11706 .add(MI
.getOperand(6))
11707 .addMemOperand(MMO
);
11708 MI
.eraseFromParent();
11711 case ARM::STRr_preidx
:
11712 case ARM::STRBr_preidx
:
11713 case ARM::STRH_preidx
: {
11715 switch (MI
.getOpcode()) {
11716 default: llvm_unreachable("unexpected opcode!");
11717 case ARM::STRr_preidx
: NewOpc
= ARM::STR_PRE_REG
; break;
11718 case ARM::STRBr_preidx
: NewOpc
= ARM::STRB_PRE_REG
; break;
11719 case ARM::STRH_preidx
: NewOpc
= ARM::STRH_PRE
; break;
11721 MachineInstrBuilder MIB
= BuildMI(*BB
, MI
, dl
, TII
->get(NewOpc
));
11722 for (unsigned i
= 0; i
< MI
.getNumOperands(); ++i
)
11723 MIB
.add(MI
.getOperand(i
));
11724 MI
.eraseFromParent();
11728 case ARM::tMOVCCr_pseudo
: {
11729 // To "insert" a SELECT_CC instruction, we actually have to insert the
11730 // diamond control-flow pattern. The incoming instruction knows the
11731 // destination vreg to set, the condition code register to branch on, the
11732 // true/false values to select between, and a branch opcode to use.
11733 const BasicBlock
*LLVM_BB
= BB
->getBasicBlock();
11734 MachineFunction::iterator It
= ++BB
->getIterator();
11739 // cmpTY ccX, r1, r2
11741 // fallthrough --> copy0MBB
11742 MachineBasicBlock
*thisMBB
= BB
;
11743 MachineFunction
*F
= BB
->getParent();
11744 MachineBasicBlock
*copy0MBB
= F
->CreateMachineBasicBlock(LLVM_BB
);
11745 MachineBasicBlock
*sinkMBB
= F
->CreateMachineBasicBlock(LLVM_BB
);
11746 F
->insert(It
, copy0MBB
);
11747 F
->insert(It
, sinkMBB
);
11749 // Check whether CPSR is live past the tMOVCCr_pseudo.
11750 const TargetRegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
11751 if (!MI
.killsRegister(ARM::CPSR
) &&
11752 !checkAndUpdateCPSRKill(MI
, thisMBB
, TRI
)) {
11753 copy0MBB
->addLiveIn(ARM::CPSR
);
11754 sinkMBB
->addLiveIn(ARM::CPSR
);
11757 // Transfer the remainder of BB and its successor edges to sinkMBB.
11758 sinkMBB
->splice(sinkMBB
->begin(), BB
,
11759 std::next(MachineBasicBlock::iterator(MI
)), BB
->end());
11760 sinkMBB
->transferSuccessorsAndUpdatePHIs(BB
);
11762 BB
->addSuccessor(copy0MBB
);
11763 BB
->addSuccessor(sinkMBB
);
11765 BuildMI(BB
, dl
, TII
->get(ARM::tBcc
))
11767 .addImm(MI
.getOperand(3).getImm())
11768 .addReg(MI
.getOperand(4).getReg());
11771 // %FalseValue = ...
11772 // # fallthrough to sinkMBB
11775 // Update machine-CFG edges
11776 BB
->addSuccessor(sinkMBB
);
11779 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
11782 BuildMI(*BB
, BB
->begin(), dl
, TII
->get(ARM::PHI
), MI
.getOperand(0).getReg())
11783 .addReg(MI
.getOperand(1).getReg())
11785 .addReg(MI
.getOperand(2).getReg())
11788 MI
.eraseFromParent(); // The pseudo instruction is gone now.
  case ARM::BCCZi64: {
    // If there is an unconditional branch to the other successor, remove it.
    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());

    // Compare both parts that make up the double comparison separately for
    // equality.
    bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;

    Register LHS1 = MI.getOperand(1).getReg();
    Register LHS2 = MI.getOperand(2).getReg();
    if (RHSisZero) {
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
          .addReg(LHS1)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
          .addReg(LHS2).addImm(0)
          .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    } else {
      Register RHS1 = MI.getOperand(3).getReg();
      Register RHS2 = MI.getOperand(4).getReg();
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
          .addReg(LHS1)
          .addReg(RHS1)
          .add(predOps(ARMCC::AL));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
          .addReg(LHS2).addReg(RHS2)
          .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    }

    MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
    if (MI.getOperand(0).getImm() == ARMCC::NE)
      std::swap(destMBB, exitMBB);

    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
        .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
    if (isThumb2)
      BuildMI(BB, dl, TII->get(ARM::t2B))
          .addMBB(exitMBB)
          .add(predOps(ARMCC::AL));
    else
      BuildMI(BB, dl, TII->get(ARM::B)).addMBB(exitMBB);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }

  case ARM::Int_eh_sjlj_setjmp:
  case ARM::Int_eh_sjlj_setjmp_nofp:
  case ARM::tInt_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp_nofp:
    return BB;

  case ARM::Int_eh_sjlj_setup_dispatch:
    EmitSjLjDispatchBlock(MI, BB);
    return BB;

  case ARM::ABS: {
    // To insert an ABS instruction, we have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // source vreg to test against 0, the destination vreg to set,
    // the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    //   BCC                      (branch to SinkBB if V0 >= 0)
    //   RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
    //   SinkBB: V1 = PHI(V2, V3)
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator BBI = ++BB->getIterator();
    MachineFunction *Fn = BB->getParent();
    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    Fn->insert(BBI, RSBBB);
    Fn->insert(BBI, SinkBB);

    Register ABSSrcReg = MI.getOperand(1).getReg();
    Register ABSDstReg = MI.getOperand(0).getReg();
    bool ABSSrcKIll = MI.getOperand(1).isKill();
    bool isThumb2 = Subtarget->isThumb2();
    MachineRegisterInfo &MRI = Fn->getRegInfo();
    // In Thumb mode S must not be specified if source register is the SP or
    // PC and if destination register is the SP, so restrict register class
    Register NewRsbDstReg = MRI.createVirtualRegister(
        isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    SinkBB->splice(SinkBB->begin(), BB,
                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
    SinkBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(RSBBB);
    BB->addSuccessor(SinkBB);

    // fall through to SinkMBB
    RSBBB->addSuccessor(SinkBB);

    // insert a cmp at the end of BB
    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
        .addReg(ABSSrcReg)
        .addImm(0)
        .add(predOps(ARMCC::AL));

    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
    BuildMI(BB, dl,
            TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
        .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);

    // insert rsbri in RSBBB
    // Note: BCC and rsbri will be converted into predicated rsbmi
    // by if-conversion pass
    BuildMI(*RSBBB, RSBBB->begin(), dl,
            TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
        .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
        .addImm(0)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    // insert PHI in SinkBB,
    // reuse ABSDstReg to not change uses of ABS instruction
    BuildMI(*SinkBB, SinkBB->begin(), dl,
            TII->get(ARM::PHI), ABSDstReg)
        .addReg(NewRsbDstReg).addMBB(RSBBB)
        .addReg(ABSSrcReg).addMBB(BB);

    // remove ABS instruction
    MI.eraseFromParent();

    // return last added BB
    return SinkBB;
  }
  case ARM::COPY_STRUCT_BYVAL_I32:
    return EmitStructByval(MI, BB);
  case ARM::WIN__CHKSTK:
    return EmitLowered__chkstk(MI, BB);
  case ARM::WIN__DBZCHK:
    return EmitLowered__dbzchk(MI, BB);
  }
}

/// Attaches vregs to MEMCPY that it will use as scratch registers
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
                                    MachineInstr &MI, const SDNode *Node) {
  bool isThumb1 = Subtarget->isThumb1Only();

  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineInstrBuilder MIB(*MF, MI);

  // If the new dst/src is unused mark it as dead.
  if (!Node->hasAnyUseOfValue(0)) {
    MI.getOperand(0).setIsDead(true);
  }
  if (!Node->hasAnyUseOfValue(1)) {
    MI.getOperand(1).setIsDead(true);
  }

  // The MEMCPY both defines and kills the scratch registers.
  for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
    Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
                                                         : &ARM::GPRRegClass);
    MIB.addReg(TmpReg, RegState::Define | RegState::Dead);
  }
}

void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                      SDNode *Node) const {
  if (MI.getOpcode() == ARM::MEMCPY) {
    attachMEMCPYScratchRegs(Subtarget, MI, Node);
    return;
  }

  const MCInstrDesc *MCID = &MI.getDesc();
  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  // operand is still set to noreg. If needed, set the optional operand's
  // register to CPSR, and remove the redundant implicit def.
  //
  // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).

  // Rename pseudo opcodes.
  unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
  unsigned ccOutIdx;
  if (NewOpc) {
    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
    MCID = &TII->get(NewOpc);

    assert(MCID->getNumOperands() ==
           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
           && "converted opcode should be the same except for cc_out"
           " (and, on Thumb1, pred)");

    MI.setDesc(*MCID);

    // Add the optional cc_out operand
    MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));

    // On Thumb1, move all input operands to the end, then add the predicate
    if (Subtarget->isThumb1Only()) {
      for (unsigned c = MCID->getNumOperands() - 4; c--;) {
        MI.addOperand(MI.getOperand(1));
        MI.RemoveOperand(1);
      }

      // Restore the ties
      for (unsigned i = MI.getNumOperands(); i--;) {
        const MachineOperand& op = MI.getOperand(i);
        if (op.isReg() && op.isUse()) {
          int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
          if (DefIdx != -1)
            MI.tieOperands(DefIdx, i);
        }
      }

      MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
      MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
      ccOutIdx = 1;
    } else
      ccOutIdx = MCID->getNumOperands() - 1;
  } else
    ccOutIdx = MCID->getNumOperands() - 1;

  // Any ARM instruction that sets the 's' bit should specify an optional
  // "cc_out" operand in the last operand position.
  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  // since we already have an optional CPSR def.
  bool definesCPSR = false;
  bool deadCPSR = false;
  for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
       ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
      definesCPSR = true;
      if (MO.isDead())
        deadCPSR = true;
      MI.RemoveOperand(i);
      break;
    }
  }
  if (!definesCPSR) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  if (deadCPSR) {
    assert(!MI.getOperand(ccOutIdx).getReg() &&
           "expect uninitialized optional cc_out operand");
    // Thumb1 instructions must have the S bit even if the CPSR is dead.
    if (!Subtarget->isThumb1Only())
      return;
  }

  // If this instruction was defined with an optional CPSR def and its dag node
  // had a live implicit CPSR def, then activate the optional CPSR def.
  MachineOperand &MO = MI.getOperand(ccOutIdx);
  MO.setReg(ARM::CPSR);
  MO.setIsDef(true);
}

12065 //===----------------------------------------------------------------------===//
12066 // ARM Optimization Hooks
12067 //===----------------------------------------------------------------------===//
12069 // Helper function that checks if N is a null or all ones constant.
12070 static inline bool isZeroOrAllOnes(SDValue N
, bool AllOnes
) {
12071 return AllOnes
? isAllOnesConstant(N
) : isNullConstant(N
);
// Return true if N is conditionally 0 or all ones.
// Detects these expressions where cc is an i1 value:
//
//   (select cc 0, y)   [AllOnes=0]
//   (select cc y, 0)   [AllOnes=0]
//   (zext cc)          [AllOnes=0]
//   (sext cc)          [AllOnes=0/1]
//   (select cc -1, y)  [AllOnes=1]
//   (select cc y, -1)  [AllOnes=1]
//
// Invert is set when N is the null/all ones constant when CC is false.
// OtherOp is set to the alternative value of N.
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
                                       SDValue &CC, bool &Invert,
                                       SDValue &OtherOp,
                                       SelectionDAG &DAG) {
  switch (N->getOpcode()) {
  default: return false;
  case ISD::SELECT: {
    CC = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    if (isZeroOrAllOnes(N1, AllOnes)) {
      Invert = false;
      OtherOp = N2;
      return true;
    }
    if (isZeroOrAllOnes(N2, AllOnes)) {
      Invert = true;
      OtherOp = N1;
      return true;
    }
    return false;
  }
  case ISD::ZERO_EXTEND:
    // (zext cc) can never be the all ones value.
    if (AllOnes)
      return false;
    LLVM_FALLTHROUGH;
  case ISD::SIGN_EXTEND: {
    SDLoc dl(N);
    EVT VT = N->getValueType(0);
    CC = N->getOperand(0);
    if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
      return false;
    Invert = !AllOnes;
    if (AllOnes)
      // When looking for an AllOnes constant, N is an sext, and the 'other'
      // value is 0.
      OtherOp = DAG.getConstant(0, dl, VT);
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      // When looking for a 0 constant, N can be zext or sext.
      OtherOp = DAG.getConstant(1, dl, VT);
    else
      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
                                VT);
    return true;
  }
  }
}
// Combine a constant select operand into its use:
//
//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
//
// The transform is rejected if the select doesn't have a constant operand that
// is null, or all ones when AllOnes is set.
//
// Also recognize sext/zext from i1:
//
//   (add (zext cc), x) -> (select cc (add x, 1), x)
//   (add (sext cc), x) -> (select cc (add x, -1), x)
//
// These transformations eventually create predicated instructions.
//
// @param N       The node to transform.
// @param Slct    The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI     Context.
// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue NonConstantVal;
  SDValue CCOp;
  bool SwapSelectOps;
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
                                  NonConstantVal, DAG))
    return SDValue();

  // Slct is now known to be the desired identity constant when CC is true.
  SDValue TrueVal = OtherOp;
  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                 OtherOp, NonConstantVal);
  // Unless SwapSelectOps says CC should be false.
  if (SwapSelectOps)
    std::swap(TrueVal, FalseVal);

  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                     CCOp, TrueVal, FalseVal);
}
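// Illustrative instance of the transform above (hypothetical operands, not
// taken from a regression test): with AllOnes=0 and an i32 select on an i1 cc,
//   (add (select cc, 0, 4), x)  -->  (select cc, x, (add x, 4))
// i.e. the add is folded into the non-identity arm of the select so it can
// later be emitted as a predicated instruction.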
// Attempt combineSelectAndUse on each operand of a commutative operator N.
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
      return Result;
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
      return Result;
  return SDValue();
}
static bool IsVUZPShuffleNode(SDNode *N) {
  // VUZP shuffle node.
  if (N->getOpcode() == ARMISD::VUZP)
    return true;

  // "VUZP" on i32 is an alias for VTRN.
  if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
    return true;

  return false;
}
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Look for ADD(VUZP.0, VUZP.1).
  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
      N0 == N1)
    return SDValue();

  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  if (!N->getValueType(0).is64BitVector())
    return SDValue();

  // Generate vpadd.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDNode *Unzip = N0.getNode();
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Unzip->getOperand(0));
  Ops.push_back(Unzip->getOperand(1));

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}
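// Rough example of the pattern above (hypothetical values): if
//   t: v8i8,v8i8 = ARMISD::VUZP a, b
// then (add t, t:1) adds the even and odd lanes of a and b pairwise, which is
// exactly what a single "vpadd.i8 d0, a, b" computes.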
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(DAG.getConstant(Opcode, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}
// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform optimization if after legalize, and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and an odd or even
  // index such that we have a pairwise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operand of the ADD which is a BUILD_VECTOR,
  // check to see if each of its operands is an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify it's the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify it's correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex += 2;
    } else
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}
static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasBaseDSP())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulate the product into a 64-bit value. The 16-bit values will
  // be sign extended somehow or SRA'd into 32-bit values
  // (addc (adde (mul 16bit, 16bit), lo), hi)
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    Lo = AddcNode->getOperand(0);
    Mul = AddcNode->getOperand(1);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    SRA = AddeNode->getOperand(1);
    Hi = AddeNode->getOperand(0);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }
  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
    if (Const->getZExtValue() != 31)
      return SDValue();
  } else
    return SDValue();

  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;

  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALBT;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1).getOperand(0);
  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = Mul.getOperand(0).getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALTT;
    Op0 = Mul->getOperand(0).getOperand(0);
    Op1 = Mul->getOperand(1).getOperand(0);
  }

  if (!Op0 || !Op1)
    return SDValue();

  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);
  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}
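// Sketch of the kind of DAG this matches (hypothetical, assuming a and b are
// known sign-extended 16-bit values):
//   t0 = mul a, b
//   lo = ARMISD::ADDC t0, lo32        hi = ARMISD::ADDE (sra t0, 31), hi32
// which accumulates the 32-bit product into the 64-bit pair {hi32,lo32}, i.e.
// a single SMLALBB lo32, hi32, a, b.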
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                V          \          [no multiline comment]
  //    loAdd ->  ADDC         |
  //                 \ :carry /
  //                  V      V
  //                    ADDE   <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
  // a constant with the exact value of 0x80000000, we recognize we are dealing
  // with a "rounded multiply and add" (or subtract) and transform it into
  // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.

  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");

  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node.
  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
       AddcSubcNode->getOpcode() != ARMISD::SUBC))
    return SDValue();

  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
    return SDValue();

  assert(AddcSubcNode->getNumValues() == 2 &&
         AddcSubcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // may be an SMLAL which multiplies two 16-bit values.
  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

  // Check for the triangle shape.
  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

  // Make sure that the ADDE/SUBE operands are not coming from the same node.
  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeSubeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue *HiAddSub = nullptr;
  SDValue *LoMul = nullptr;
  SDValue *LowAddSub = nullptr;

  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
    return SDValue();

  if (IsLeftOperandMUL)
    HiAddSub = &AddeSubeOp1;
  else
    HiAddSub = &AddeSubeOp0;

  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
  // whose low result is fed to the ADDC/SUBC we are checking.

  if (AddcSubcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp0;
    LowAddSub = &AddcSubcOp1;
  }
  if (AddcSubcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp1;
    LowAddSub = &AddcSubcOp0;
  }

  if (!LoMul)
    return SDValue();

  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
  // the replacement below will create a cycle.
  if (AddcSubcNode == HiAddSub->getNode() ||
      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Start building operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));

  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
  // the case, we must be doing signed multiplication and only use the higher
  // part of the result of the MLAL, furthermore the LowAddSub must be a constant
  // addition or subtraction with the value of 0x80000000.
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function can not handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL. Check if Addc uses a node which has already
  // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
  // as the addend, and it's handled in PerformUMLALCombine.

  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Check that we have a glued ADDC node.
  SDNode* AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the converted UMAAL or quit if it doesn't exist.
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(0).getNode();
    AddHi = AddcNode->getOperand(1);
  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(1).getNode();
    AddHi = AddcNode->getOperand(0);
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  // the ADDC as well as Zero.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  if ((isNullConstant(AddeNode->getOperand(0)) &&
       AddeNode->getOperand(1).getNode() == UmlalNode) ||
      (AddeNode->getOperand(0).getNode() == UmlalNode &&
       isNullConstant(AddeNode->getOperand(1)))) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                      UmlalNode->getOperand(2), AddHi };
    SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                DAG.getVTList(MVT::i32, MVT::i32), Ops);

    // Replace the ADDs' nodes uses by the UMAAL node's values.
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));

    // Return original node to notify the driver to stop replacing.
    return SDValue(AddeNode, 0);
  }
  return SDValue();
}
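// Roughly, the case handled above looks like (hypothetical operands):
//   u: i32,i32 = ARMISD::UMLAL a, b, lo, 0
//   c          = ARMISD::ADDC  u, addhi
//   e          = ARMISD::ADDE  u:1, 0, c:1
// which computes a*b + zext(lo) + zext(addhi), i.e. UMAAL a, b, lo, addhi.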
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();

  // Check that we have a pair of ADDC and ADDE as operands.
  // Both addends of the ADDE must be zero.
  SDNode* AddcNode = N->getOperand(2).getNode();
  SDNode* AddeNode = N->getOperand(3).getNode();
  if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
      (AddeNode->getOpcode() == ARMISD::ADDE) &&
      isNullConstant(AddeNode->getOperand(0)) &&
      isNullConstant(AddeNode->getOperand(1)) &&
      (AddeNode->getOperand(2).getNode() == AddcNode))
    return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
                       DAG.getVTList(MVT::i32, MVT::i32),
                       {N->getOperand(0), N->getOperand(1),
                        AddcNode->getOperand(0), AddcNode->getOperand(1)});
  else
    return SDValue();
}
static SDValue PerformAddcSubcCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG(DCI.DAG);

  if (N->getOpcode() == ARMISD::SUBC) {
    // (SUBC (ADDE 0, 0, C), 1) -> C
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    if (LHS->getOpcode() == ARMISD::ADDE &&
        isNullConstant(LHS->getOperand(0)) &&
        isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
      return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
    }
  }

  if (Subtarget->isThumb1Only()) {
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int32_t imm = C->getSExtValue();
      if (imm < 0 && imm > std::numeric_limits<int>::min()) {
        SDLoc DL(N);
        RHS = DAG.getConstant(-imm, DL, MVT::i32);
        unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
                                                           : ARMISD::ADDC;
        return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
      }
    }
  }

  return SDValue();
}
static SDValue PerformAddeSubeCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  if (Subtarget->isThumb1Only()) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int64_t imm = C->getSExtValue();
      if (imm < 0) {
        SDLoc DL(N);

        // The with-carry-in form matches bitwise not instead of the negation.
        // Effectively, the inverse interpretation of the carry flag already
        // accounts for part of the negation.
        RHS = DAG.getConstant(~imm, DL, MVT::i32);

        unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
                                                           : ARMISD::ADDE;
        return DAG.getNode(Opcode, DL, N->getVTList(),
                           N->getOperand(0), RHS, N->getOperand(2));
      }
    }
  } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
    return AddCombineTo64bitMLAL(N, DCI, Subtarget);
  }
  return SDValue();
}
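// Worked example of the Thumb1 rewrite above (hypothetical constant): for
// imm = -3, ~imm = 2, so (ARMISD::ADDE x, -3, carry) becomes
// (ARMISD::SUBE x, 2, carry); the inverted borrow interpretation of the carry
// supplies the remaining -1, so both forms compute x - 3 + carry.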
static SDValue PerformSELECTCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  SDLoc dl(N);
  SDValue SetCC;
  SDValue LHS;
  SDValue RHS;
  ISD::CondCode CC;
  SDValue TrueVal;
  SDValue FalseVal;

  if (N->getOpcode() == ISD::SELECT &&
      N->getOperand(0)->getOpcode() == ISD::SETCC) {
    SetCC = N->getOperand(0);
    LHS = SetCC->getOperand(0);
    RHS = SetCC->getOperand(1);
    CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
    TrueVal = N->getOperand(1);
    FalseVal = N->getOperand(2);
  } else if (N->getOpcode() == ISD::SELECT_CC) {
    LHS = N->getOperand(0);
    RHS = N->getOperand(1);
    CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
    TrueVal = N->getOperand(2);
    FalseVal = N->getOperand(3);
  } else {
    return SDValue();
  }

  unsigned int Opcode = 0;
  if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
       FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
      (CC == ISD::SETULT || CC == ISD::SETUGT)) {
    Opcode = ARMISD::VMINVu;
    if (CC == ISD::SETUGT)
      std::swap(TrueVal, FalseVal);
  } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
              FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
             (CC == ISD::SETLT || CC == ISD::SETGT)) {
    Opcode = ARMISD::VMINVs;
    if (CC == ISD::SETGT)
      std::swap(TrueVal, FalseVal);
  } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
              FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
             (CC == ISD::SETUGT || CC == ISD::SETULT)) {
    Opcode = ARMISD::VMAXVu;
    if (CC == ISD::SETULT)
      std::swap(TrueVal, FalseVal);
  } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
              FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
             (CC == ISD::SETGT || CC == ISD::SETLT)) {
    Opcode = ARMISD::VMAXVs;
    if (CC == ISD::SETLT)
      std::swap(TrueVal, FalseVal);
  } else
    return SDValue();

  // Normalise to the right hand side being the vector reduction
  switch (TrueVal->getOpcode()) {
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_SMAX:
    std::swap(LHS, RHS);
    std::swap(TrueVal, FalseVal);
    break;
  }

  EVT VectorType = FalseVal->getOperand(0).getValueType();

  if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
      VectorType != MVT::v4i32)
    return SDValue();

  EVT VectorScalarType = VectorType.getVectorElementType();

  // The values being selected must also be the ones being compared
  if (TrueVal != LHS || FalseVal != RHS)
    return SDValue();

  EVT LeftType = LHS->getValueType(0);
  EVT RightType = RHS->getValueType(0);

  // The types must match the reduced type too
  if (LeftType != VectorScalarType || RightType != VectorScalarType)
    return SDValue();

  // Legalise the scalar to an i32
  if (VectorScalarType != MVT::i32)
    LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

  // Generate the reduction as an i32 for legalisation purposes
  auto Reduction =
      DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));

  // The result isn't actually an i32 so truncate it back to its original type
  if (VectorScalarType != MVT::i32)
    Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);

  return Reduction;
}
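// Illustrative case (hypothetical types): with i8 x and v16i8 v,
//   select (setult x, (vecreduce_umin v)), x, (vecreduce_umin v)
// is the combined scalar/vector minimum, which the code above turns into a
// single ARMISD::VMINVu of x (any-extended to i32) and v, truncated back to
// i8.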
// A special combine for the vqdmulh family of instructions. This is one of the
// potential set of patterns that could patch this instruction. The base pattern
// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
// the max is unnecessary.
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue Shft;
  ConstantSDNode *Clamp;

  if (!VT.isVector())
    return SDValue();

  if (N->getOpcode() == ISD::SMIN) {
    Shft = N->getOperand(0);
    Clamp = isConstOrConstSplat(N->getOperand(1));
  } else if (N->getOpcode() == ISD::VSELECT) {
    // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
    SDValue Cmp = N->getOperand(0);
    if (Cmp.getOpcode() != ISD::SETCC ||
        cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
        Cmp.getOperand(0) != N->getOperand(1) ||
        Cmp.getOperand(1) != N->getOperand(2))
      return SDValue();
    Shft = N->getOperand(1);
    Clamp = isConstOrConstSplat(N->getOperand(2));
  } else
    return SDValue();

  if (!Clamp)
    return SDValue();

  MVT ScalarType;
  int ShftAmt = 0;
  switch (Clamp->getSExtValue()) {
  case (1 << 7) - 1:
    ScalarType = MVT::i8;
    ShftAmt = 7;
    break;
  case (1 << 15) - 1:
    ScalarType = MVT::i16;
    ShftAmt = 15;
    break;
  case (1ULL << 31) - 1:
    ScalarType = MVT::i32;
    ShftAmt = 31;
    break;
  default:
    return SDValue();
  }

  if (Shft.getOpcode() != ISD::SRA)
    return SDValue();
  ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
  if (!N1 || N1->getSExtValue() != ShftAmt)
    return SDValue();

  SDValue Mul = Shft.getOperand(0);
  if (Mul.getOpcode() != ISD::MUL)
    return SDValue();

  SDValue Ext0 = Mul.getOperand(0);
  SDValue Ext1 = Mul.getOperand(1);
  if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
      Ext1.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();
  EVT VecVT = Ext0.getOperand(0).getValueType();
  if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
    return SDValue();
  if (Ext1.getOperand(0).getValueType() != VecVT ||
      VecVT.getScalarType() != ScalarType ||
      VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
    return SDValue();

  SDLoc DL(Mul);
  unsigned LegalLanes = 128 / (ShftAmt + 1);
  EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
  // For types smaller than legal vectors extend to be legal and only use needed
  // lanes.
  if (VecVT.getSizeInBits() < 128) {
    EVT ExtVecVT =
        MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
                         VecVT.getVectorNumElements());
    SDValue Inp0 =
        DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
    SDValue Inp1 =
        DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
    Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
    Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
    SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
    SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
    Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
  }

  // For larger types, split into legal sized chunks.
  assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
  unsigned NumParts = VecVT.getSizeInBits() / 128;
  SmallVector<SDValue> Parts;
  for (unsigned I = 0; I < NumParts; ++I) {
    SDValue Inp0 =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
                    DAG.getVectorIdxConstant(I * LegalLanes, DL));
    SDValue Inp1 =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
                    DAG.getVectorIdxConstant(I * LegalLanes, DL));
    SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
    Parts.push_back(VQDMULH);
  }
  return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
}
static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
    return V;

  // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
  //
  // We need to re-implement this optimization here as the implementation in the
  // Target-Independent DAGCombiner does not handle the kind of constant we make
  // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
  // good reason, allowing truncation there would break other targets).
  //
  // Currently, this is only done for MVE, as it's the only target that benefits
  // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
  if (N->getOperand(0).getOpcode() != ISD::XOR)
    return SDValue();
  SDValue XOR = N->getOperand(0);

  // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
  // It is important to check with truncation allowed as the BUILD_VECTORs we
  // generate in those situations will truncate their operands.
  ConstantSDNode *Const =
      isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
                          /*AllowTruncation*/ true);
  if (!Const || !Const->isOne())
    return SDValue();

  // Rewrite into vselect(cond, rhs, lhs).
  SDValue Cond = XOR->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT Type = N->getValueType(0);
  return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
}
static SDValue PerformABSCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue res;
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
    return SDValue();

  if (!TLI.expandABS(N, res, DAG))
    return SDValue();

  return res;
}
/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
static SDValue PerformADDECombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  // Only ARM and Thumb2 support UMLAL/SMLAL.
  if (Subtarget->isThumb1Only())
    return PerformAddeSubeCombine(N, DCI, Subtarget);

  // Only perform the checks after legalize when the pattern is available.
  if (DCI.isBeforeLegalize()) return SDValue();

  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const ARMSubtarget *Subtarget){
  // Attempt to create vpadd for this add.
  if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
    return Result;

  // Attempt to create vpaddl for this add.
  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
    return Result;
  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
                                                      Subtarget))
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
      return Result;
  return SDValue();
}
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc dl(N);

  auto IsVecReduce = [](SDValue Op) {
    switch (Op.getOpcode()) {
    case ISD::VECREDUCE_ADD:
    case ARMISD::VADDVs:
    case ARMISD::VADDVu:
    case ARMISD::VMLAVs:
    case ARMISD::VMLAVu:
      return true;
    }
    return false;
  };

  auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
    // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
    //   add(add(X, vecreduce(Y)), vecreduce(Z))
    // to make better use of vaddva style instructions.
    if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
        IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
        !isa<ConstantSDNode>(N0)) {
      SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
      return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
    }
    // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
    //   add(add(add(A, C), reduce(B)), reduce(D))
    if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
        N1.getOpcode() == ISD::ADD) {
      unsigned N0RedOp = 0;
      if (!IsVecReduce(N0.getOperand(N0RedOp))) {
        N0RedOp = 1;
        if (!IsVecReduce(N0.getOperand(N0RedOp)))
          return SDValue();
      }

      unsigned N1RedOp = 0;
      if (!IsVecReduce(N1.getOperand(N1RedOp)))
        N1RedOp = 1;
      if (!IsVecReduce(N1.getOperand(N1RedOp)))
        return SDValue();

      SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
                                 N1.getOperand(1 - N1RedOp));
      SDValue Add1 =
          DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
      return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
    }
    return SDValue();
  };
  if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
    return R;
  if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
    return R;

  // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
  // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
  // by ascending load offsets. This can help cores prefetch if the order of
  // loads is more predictable.
  auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
    // Check if two reductions are known to load data where one is before/after
    // another. Return negative if N0 loads data before N1, positive if N1 is
    // before N0 and 0 otherwise if nothing is known.
    auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
      // Look through to the first operand of a MUL, for the VMLA case.
      // Currently only looks at the first operand, in the hope they are equal.
      if (N0.getOpcode() == ISD::MUL)
        N0 = N0.getOperand(0);
      if (N1.getOpcode() == ISD::MUL)
        N1 = N1.getOperand(0);

      // Return true if the two operands are loads to the same object and the
      // offset of the first is known to be less than the offset of the second.
      LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
      LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
      if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
          !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
          Load1->isIndexed())
        return 0;

      auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
      auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);

      if (!BaseLocDecomp0.getBase() ||
          BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
          !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
        return 0;
      if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
        return -1;
      if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
        return 1;
      return 0;
    };

    SDValue X;
    if (N0.getOpcode() == ISD::ADD) {
      if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
        int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
                                          N0.getOperand(1).getOperand(0));
        if (IsBefore < 0) {
          X = N0.getOperand(0);
          N0 = N0.getOperand(1);
        } else if (IsBefore > 0) {
          X = N0.getOperand(1);
          N0 = N0.getOperand(0);
        } else
          return SDValue();
      } else if (IsVecReduce(N0.getOperand(0))) {
        X = N0.getOperand(1);
        N0 = N0.getOperand(0);
      } else if (IsVecReduce(N0.getOperand(1))) {
        X = N0.getOperand(0);
        N0 = N0.getOperand(1);
      } else
        return SDValue();
    } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
               IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
      // Note this is backward to how you would expect. We create
      // add(reduce(load + 16), reduce(load + 0)) so that the
      // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
      // the X as VADDV(load + 0)
      return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
    } else
      return SDValue();

    if (!IsVecReduce(N0) || !IsVecReduce(N1))
      return SDValue();

    if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
      return SDValue();

    // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
    SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
    return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
  };
  if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
    return R;
  if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
    return R;
  return SDValue();
}
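// Illustrative effect of the load-ordering rule above (hypothetical loads from
// a base pointer p): add(vecreduce(load p+16), vecreduce(load p+0)) is kept in
// this operand order so that the later accumulate combine consumes the higher
// offset, leaving VADDV(load p+0) before VADDVA(load p+16) and giving an
// ascending access pattern that is easier for prefetchers to follow.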
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
    return R;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc dl(N);

  if (VT != MVT::i64)
    return SDValue();

  // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
  // will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // Otherwise we try to push the add up above VADDLVAx, to potentially allow
  // the add to be simplified separately.
  // We also need to check for sext / zext and commutative adds.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
        VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    if (VecRed->getOpcode() == OpcodeA) {
      // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
      SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                VecRed.getOperand(0), VecRed.getOperand(1));
      NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
    }

    SmallVector<SDValue, 4> Ops;
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(0, dl, MVT::i32)));
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(1, dl, MVT::i32)));
    unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
    for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
      Ops.push_back(VecRed->getOperand(I));
    SDValue Red =
        DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                       SDValue(Red.getNode(), 1));
  };

  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
    return M;
  return SDValue();
}
bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                 CombineLevel Level) const {
  if (Level == BeforeLegalizeTypes)
    return true;

  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}

bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  return VT.isScalarInteger();
}
static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - if c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
                                     C2Int.getBitWidth() - C2->getZExtValue());
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}
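// Worked constants for the unfolding above (hypothetical, matching the
// "b + ((a << 1) + 510)" pattern): c1<<c2 = 510 and c2 = 1, so c1 = 255 and
// both fit in a rotated 8-bit immediate; (add (shl a, 1), 510) is rebuilt as
// (shl (add a, 255), 1), letting the user fold the final shift into its
// shifted-operand form.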
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only works one way, because it needs an immediate operand.
  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
    return Result;

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}
// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
//   providing -X is as cheap as X (currently, just a constant).
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
  if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
    return SDValue();
  SDValue CSINC = N->getOperand(1);
  if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
    return SDValue();

  ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
  if (!X)
    return SDValue();

  return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
                     DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
                                 CSINC.getOperand(0)),
                     CSINC.getOperand(1), CSINC.getOperand(2),
                     CSINC.getOperand(3));
}
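// Example with X = 1 (hypothetical): (sub 0, (csinc 1, y, cc)) becomes
// (csinv -1, y, cc), since -(y + 1) == ~y, and the negated constant is as
// cheap to materialize as the original one.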
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
      return Result;

  if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
    return R;

  if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
    return SDValue();

  // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
  // so that we can readily pattern match more mve instructions which can use
  // a scalar operand.
  SDValue VDup = N->getOperand(1);
  if (VDup->getOpcode() != ARMISD::VDUP)
    return SDValue();

  SDValue VMov = N->getOperand(0);
  if (VMov->getOpcode() == ISD::BITCAST)
    VMov = VMov->getOperand(0);

  if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
    return SDValue();

  SDLoc dl(N);
  SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DCI.DAG.getConstant(0, dl, MVT::i32),
                                   VDup->getOperand(0));
  return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}
/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
/// special multiplier accumulator forwarding.
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
//  However, for (A + B) * (A + B),
//    vadd d2, d0, d1
//    vmul d3, d0, d2
//    vmla d3, d1, d2
//  is slower than
//    vadd d2, d0, d1
//    vmul d3, d2, d2
static SDValue PerformVMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto IsSignExt = [&](SDValue Op) {
    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
      return SDValue();
    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
    if (VT.getScalarSizeInBits() == 32)
      return Op->getOperand(0);
    return SDValue();
  };
  auto IsZeroExt = [&](SDValue Op) {
    // Zero extends are a little more awkward. At the point we are matching
    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
    // That might be before or after a bitcast depending on how the and is
    // placed. Because this has to look through bitcasts, it is currently only
    // supported on LE.
    if (!Subtarget->isLittle())
      return SDValue();

    SDValue And = Op;
    if (And->getOpcode() == ISD::BITCAST)
      And = And->getOperand(0);
    if (And->getOpcode() != ISD::AND)
      return SDValue();
    SDValue Mask = And->getOperand(1);
    if (Mask->getOpcode() == ISD::BITCAST)
      Mask = Mask->getOperand(0);

    if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
        Mask.getValueType() != MVT::v4i32)
      return SDValue();
    if (isAllOnesConstant(Mask->getOperand(0)) &&
        isNullConstant(Mask->getOperand(1)) &&
        isAllOnesConstant(Mask->getOperand(2)) &&
        isNullConstant(Mask->getOperand(3)))
      return And->getOperand(0);
    return SDValue();
  };

  SDLoc dl(N);
  if (SDValue Op0 = IsSignExt(N0)) {
    if (SDValue Op1 = IsSignExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
    }
  }
  if (SDValue Op0 = IsZeroExt(N0)) {
    if (SDValue Op1 = IsZeroExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
    }
  }

  return SDValue();
}
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
    return PerformMVEVMULLCombine(N, DAG, Subtarget);

  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}

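// Worked examples of the expansion above (illustrative):
//   mul x, 9  -> (add (shl x, 3), x)            ; 2^3 + 1
//   mul x, 7  -> (sub (shl x, 3), x)            ; 2^3 - 1
//   mul x, 20 -> (shl (add (shl x, 2), x), 2)   ; (2^2 + 1) << 2
//   mul x, -7 -> (sub x, (shl x, 3))            ; -(2^3 - 1)
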
static SDValue CombineANDShift(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  if (LeftShift)
    C1 &= (-1U << C2);
  else
    C1 &= (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.

  // First pattern: right shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (!LeftShift && isMask_32(C1)) {
    uint32_t C3 = countLeadingZeros(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // First pattern, reversed: left shift, then mask off trailing bits.
  if (LeftShift && isMask_32(~C1)) {
    uint32_t C3 = countTrailingZeros(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern: left shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (LeftShift && isShiftedMask_32(C1)) {
    uint32_t Trailing = countTrailingZeros(C1);
    uint32_t C3 = countLeadingZeros(C1);
    if (Trailing == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern, reversed: right shift, then mask off trailing bits.
  // FIXME: Handle other patterns of known/demanded bits.
  if (!LeftShift && isShiftedMask_32(C1)) {
    uint32_t Leading = countLeadingZeros(C1);
    uint32_t C3 = countTrailingZeros(C1);
    if (Leading == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // FIXME: Transform "(and (shl x, c2) c1)" ->
  //        "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate
  //        than c1.
  return SDValue();
}

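// Worked example of the first pattern above (illustrative):
//   (and (srl x, 3), 0x1f)      ; C1 = 0x1f, C2 = 3, C3 = 27
//     -> (srl (shl x, 24), 27)  ; two shifts, no mask constant to materialize
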
static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VBIC
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 ||
      VT == MVT::v8i1 || VT == MVT::v16i1)
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
        SplatBitSize == 64) {
      EVT VbicVT;
      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VbicVT, VT, OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
            DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->isThumb1Only())
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;

  return SDValue();
}

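// Illustrative example of the immediate-form VBIC fold above:
//   (and v4i32 X, (build_vector 0xffffff00, ...)) -> (VBICIMM X, #0xff)
// since the inverted splat 0xff is encodable as a modified immediate.
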
// Try combining OR nodes to SMULWB, SMULWT.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI.
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  } else
    return SDValue();

  SDLoc dl(OR);
  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

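// Illustrative shape matched above: the OR reassembles bits [47:16] of the
// 64-bit product from the low half (srl ..., 16) and the high half
// (shl ..., 16) of the same smul_lohi, which is what SMULW[B|T] compute, e.g.
//   (or (srl (smul_lohi x, (sext_inreg y, i16)), 16),
//       (shl (smul_lohi x, (sext_inreg y, i16)), 16)) -> (SMULWB x, y)
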
static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI pattern
    // to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}

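// Worked example of case (1) above (illustrative):
//   (or (and A, 0xffffff00), 0xab) -> (ARMISD::BFI A, 0xab, 0xffffff00)
// i.e. insert the constant 0xab into bits [7:0] of A.
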
static bool isValidMVECond(unsigned CC, bool IsFloat) {
  switch (CC) {
  case ARMCC::EQ:
  case ARMCC::NE:
  case ARMCC::LE:
  case ARMCC::GT:
  case ARMCC::GE:
  case ARMCC::LT:
    return true;
  case ARMCC::HS:
  case ARMCC::HI:
    return !IsFloat;
  default:
    return false;
  }
}

static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
  if (N->getOpcode() == ARMISD::VCMP)
    return (ARMCC::CondCodes)N->getConstantOperandVal(2);
  else if (N->getOpcode() == ARMISD::VCMPZ)
    return (ARMCC::CondCodes)N->getConstantOperandVal(1);
  else
    llvm_unreachable("Not a VCMP/VCMPZ!");
}

static bool CanInvertMVEVCMP(SDValue N) {
  ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
  return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
}

static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  // together with predicates.
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto IsFreelyInvertable = [&](SDValue V) {
    if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
      return CanInvertMVEVCMP(V);
    return false;
  };

  // At least one operand must be freely invertable.
  if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
    return SDValue();

  SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
  SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
  return DAG.getLogicalNOT(DL, And, VT);
}

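// Illustrative shape of the fold above (De Morgan on predicates):
//   (or (vcmp a, b, cc), P) -> (not (and (vcmp a, b, !cc), (not P)))
// The AND of predicates is easier to chain with other predicated operations.
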
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
    return PerformORCombine_i1(N, DAG, Subtarget);

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
        SplatBitSize == 64) {
      EVT VorrVT;
      SDValue Val =
          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
                            SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
            DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) && !HasAnyUndefs) {
        // Ensure that the bit width of the constants are the same and that
        // the splat arguments are logical inverses as per the pattern we
        // are trying to simplify.
        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
            SplatBits0 == ~SplatBits1) {
          // Canonicalize the vector type to make instruction selection
          // simpler.
          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
          SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
                                       N0->getOperand(1),
                                       N0->getOperand(0),
                                       N1->getOperand(0));
          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
        }
      }
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}

static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->hasMVEIntegerOps()) {
    // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    const TargetLowering *TLI = Subtarget->getTargetLowering();
    if (TLI->isConstTrueVal(N1.getNode()) &&
        (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
      if (CanInvertMVEVCMP(N0)) {
        SDLoc DL(N);
        ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));

        SmallVector<SDValue, 4> Ops;
        Ops.push_back(N0->getOperand(0));
        if (N0->getOpcode() == ARMISD::VCMP)
          Ops.push_back(N0->getOperand(1));
        Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
        return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
      }
    }
  }

  return SDValue();
}

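// Illustrative example of the MVE fold above:
//   (xor (vcmp a, b, eq), (splat -1)) -> (vcmp a, b, ne)
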
// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and
// return it, and fill in FromMask and ToMask with (consecutive) bits in "from"
// to be extracted and their position in "to" (Rd).
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  assert(N->getOpcode() == ARMISD::BFI);

  SDValue From = N->getOperand(1);
  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());

  // If the Base came from a SHR #C, we can deduce that it is really testing bit
  // #C in the base of the SHR.
  if (From->getOpcode() == ISD::SRL &&
      isa<ConstantSDNode>(From->getOperand(1))) {
    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
    FromMask <<= Shift.getLimitedValue(31);
    From = From->getOperand(0);
  }

  return From;
}

// If A and B contain one contiguous set of bits, does A | B == A . B?
//
// Neither A nor B must be zero.
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  unsigned LastActiveBitInA = A.countTrailingZeros();
  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Find a BFI it can combine with, if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  SDValue V = To;
  if (V.getOpcode() != ARMISD::BFI)
    return SDValue();

  APInt NewToMask, NewFromMask;
  SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
  if (NewFrom != From)
    return SDValue();

  // Do the written bits conflict with any we've seen so far?
  if ((NewToMask & ToMask).getBoolValue())
    // Conflicting bits.
    return SDValue();

  // Are the new bits contiguous when combined with the old bits?
  if (BitsProperlyConcatenate(ToMask, NewToMask) &&
      BitsProperlyConcatenate(FromMask, NewFromMask))
    return V;
  if (BitsProperlyConcatenate(NewToMask, ToMask) &&
      BitsProperlyConcatenate(NewFromMask, FromMask))
    return V;

  return SDValue();
}

static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    if ((Mask & (~Mask2)) == 0)
      return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                         N->getOperand(0), N1.getOperand(0), N->getOperand(2));
    return SDValue();
  }

  // Look for another BFI to combine with.
  if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // Create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    if (NewFromMask[0] == 0)
      From1 = DAG.getNode(
          ISD::SRL, dl, VT, From1,
          DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
                       DAG.getConstant(~NewToMask, dl, VT));
  }

  // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
  // that lower bit insertions are performed first, providing that M1 and M2
  // do not overlap. This can allow multiple BFI instructions to be combined
  // together by the other folds above.
  if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    APInt ToMask1 = ~N->getConstantOperandAPInt(2);
    APInt ToMask2 = ~N0.getConstantOperandAPInt(2);

    if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
        ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
      return SDValue();

    EVT VT = N->getValueType(0);
    SDLoc dl(N);
    SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
                               N->getOperand(1), N->getOperand(2));
    return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
                       N0.getOperand(2));
  }

  return SDValue();
}

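// Worked example of the BFI pairing above (illustrative):
//   (bfi (bfi A, B, 0xffffff00), (srl B, 8), 0xffff00ff)
//     -> (bfi A, B, 0xffff0000)
// Two adjacent one-byte insertions from the same source B become a single
// half-word insertion.
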
/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                    LD->getAlignment(), LD->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));

    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
                                 LD->getPointerInfo().getWithOffset(4),
                                 std::min(4U, LD->getAlignment()),
                                 LD->getMemOperand()->getFlags());

    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap(NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
  // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
  if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(InDouble.getOperand(1))) {
    SDValue BV = InDouble.getOperand(0);
    // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
    // change lane order under big endian.
    bool BVSwap = BV.getOpcode() == ISD::BITCAST;
    while (
        (BV.getOpcode() == ISD::BITCAST ||
         BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
        (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
      BVSwap = BV.getOpcode() == ISD::BITCAST;
      BV = BV.getOperand(0);
    }
    if (BV.getValueType() != MVT::v4i32)
      return SDValue();

    // Handle buildvectors, pulling out the correct lane depending on
    // endianness.
    unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
    if (BV.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Op0 = BV.getOperand(Offset);
      SDValue Op1 = BV.getOperand(Offset + 1);
      if (!Subtarget->isLittle() && BVSwap)
        std::swap(Op0, Op1);

      return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
    }

    // A chain of insert_vectors, grabbing the correct value of the chain of
    // inserts.
    SDValue Op0, Op1;
    while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
      if (isa<ConstantSDNode>(BV.getOperand(2))) {
        if (BV.getConstantOperandVal(2) == Offset)
          Op0 = BV.getOperand(1);
        if (BV.getConstantOperandVal(2) == Offset + 1)
          Op1 = BV.getOperand(1);
      }
      BV = BV.getOperand(0);
    }
    if (!Subtarget->isLittle() && BVSwap)
      std::swap(Op0, Op1);
    if (Op0 && Op1)
      return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
  }

  return SDValue();
}

/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (Op0.getOpcode() == ISD::BITCAST)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::BITCAST)
    Op1 = Op1.getOperand(0);
  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
      Op0.getNode() == Op1.getNode() &&
      Op0.getResNo() == 0 && Op1.getResNo() == 1)
    return DAG.getNode(ISD::BITCAST, SDLoc(N),
                       N->getValueType(0), Op0.getOperand(0));
  return SDValue();
}

static SDValue PerformVMOVhrCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);

  // VMOVhr (VMOVrh (X)) -> X
  if (Op0->getOpcode() == ARMISD::VMOVrh)
    return Op0->getOperand(0);

  // FullFP16: half values are passed in S-registers, and we don't
  // need any of the bitcast and moves:
  //
  //     t2: f32,ch = CopyFromReg t0, Register:f32 %0
  //   t5: i32 = bitcast t2
  // t18: f16 = ARMISD::VMOVhr t5
  if (Op0->getOpcode() == ISD::BITCAST) {
    SDValue Copy = Op0->getOperand(0);
    if (Copy.getValueType() == MVT::f32 &&
        Copy->getOpcode() == ISD::CopyFromReg) {
      SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
      SDValue NewCopy =
          DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
      return NewCopy;
    }
  }

  // fold (VMOVhr (load x)) -> (load (f16*)x)
  if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
    if (LN0->hasOneUse() && LN0->isUnindexed() &&
        LN0->getMemoryVT() == MVT::i16) {
      SDValue Load =
          DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
                          LN0->getBasePtr(), LN0->getMemOperand());
      DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
      DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
      return Load;
    }
  }

  // Only the bottom 16 bits of the source register are used.
  APInt DemandedMask = APInt::getLowBitsSet(32, 16);
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (VMOVrh (fpconst x)) -> const x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
    APFloat V = C->getValueAPF();
    return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
  }

  // fold (VMOVrh (load x)) -> (zextload (i16*)x)
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);

    SDValue Load =
        DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
                       LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
    return Load;
  }

  // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
  if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(N0->getOperand(1)))
    return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
                       N0->getOperand(1));

  return SDValue();
}

/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads. If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
/// directly into a VFP register.
static bool hasNormalLoadOperand(SDNode *N) {
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
      return true;
  }
  return false;
}

/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
/// ISD::BUILD_VECTOR.
static SDValue PerformBUILD_VECTORCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
  // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
  // into a pair of GPRs, which is fine when the value is used as a scalar,
  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
  SelectionDAG &DAG = DCI.DAG;
  if (N->getNumOperands() == 2)
    if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
      return RV;

  // Load i64 elements as f64 values so that type legalization does not split
  // them up into i32 values.
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}

14676 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
14678 PerformARMBUILD_VECTORCombine(SDNode
*N
, TargetLowering::DAGCombinerInfo
&DCI
) {
14679 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
14680 // At that time, we may have inserted bitcasts from integer to float.
14681 // If these bitcasts have survived DAGCombine, change the lowering of this
14682 // BUILD_VECTOR in something more vector friendly, i.e., that does not
14683 // force to use floating point types.
14685 // Make sure we can change the type of the vector.
14686 // This is possible iff:
14687 // 1. The vector is only used in a bitcast to a integer type. I.e.,
14688 // 1.1. Vector is used only once.
14689 // 1.2. Use is a bit convert to an integer type.
14690 // 2. The size of its operands are 32-bits (64-bits are not legal).
14691 EVT VT
= N
->getValueType(0);
14692 EVT EltVT
= VT
.getVectorElementType();
14694 // Check 1.1. and 2.
14695 if (EltVT
.getSizeInBits() != 32 || !N
->hasOneUse())
14698 // By construction, the input type must be float.
14699 assert(EltVT
== MVT::f32
&& "Unexpected type!");
14702 SDNode
*Use
= *N
->use_begin();
14703 if (Use
->getOpcode() != ISD::BITCAST
||
14704 Use
->getValueType(0).isFloatingPoint())
14707 // Check profitability.
14708 // Model is, if more than half of the relevant operands are bitcast from
14709 // i32, turn the build_vector into a sequence of insert_vector_elt.
14710 // Relevant operands are everything that is not statically
14711 // (i.e., at compile time) bitcasted.
14712 unsigned NumOfBitCastedElts
= 0;
14713 unsigned NumElts
= VT
.getVectorNumElements();
14714 unsigned NumOfRelevantElts
= NumElts
;
14715 for (unsigned Idx
= 0; Idx
< NumElts
; ++Idx
) {
14716 SDValue Elt
= N
->getOperand(Idx
);
14717 if (Elt
->getOpcode() == ISD::BITCAST
) {
14718 // Assume only bit cast to i32 will go away.
14719 if (Elt
->getOperand(0).getValueType() == MVT::i32
)
14720 ++NumOfBitCastedElts
;
14721 } else if (Elt
.isUndef() || isa
<ConstantSDNode
>(Elt
))
14722 // Constants are statically casted, thus do not count them as
14723 // relevant operands.
14724 --NumOfRelevantElts
;
14727 // Check if more than half of the elements require a non-free bitcast.
14728 if (NumOfBitCastedElts
<= NumOfRelevantElts
/ 2)
14731 SelectionDAG
&DAG
= DCI
.DAG
;
14732 // Create the new vector type.
14733 EVT VecVT
= EVT::getVectorVT(*DAG
.getContext(), MVT::i32
, NumElts
);
14734 // Check if the type is legal.
14735 const TargetLowering
&TLI
= DAG
.getTargetLoweringInfo();
14736 if (!TLI
.isTypeLegal(VecVT
))
14740 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
14741 // => BITCAST INSERT_VECTOR_ELT
14742 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
14743 // (BITCAST EN), N.
14744 SDValue Vec
= DAG
.getUNDEF(VecVT
);
14746 for (unsigned Idx
= 0 ; Idx
< NumElts
; ++Idx
) {
14747 SDValue V
= N
->getOperand(Idx
);
14750 if (V
.getOpcode() == ISD::BITCAST
&&
14751 V
->getOperand(0).getValueType() == MVT::i32
)
14752 // Fold obvious case.
14753 V
= V
.getOperand(0);
14755 V
= DAG
.getNode(ISD::BITCAST
, SDLoc(V
), MVT::i32
, V
);
14756 // Make the DAGCombiner fold the bitcasts.
14757 DCI
.AddToWorklist(V
.getNode());
14759 SDValue LaneIdx
= DAG
.getConstant(Idx
, dl
, MVT::i32
);
14760 Vec
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, VecVT
, Vec
, V
, LaneIdx
);
14762 Vec
= DAG
.getNode(ISD::BITCAST
, dl
, VT
, Vec
);
14763 // Make the DAGCombiner fold the bitcasts.
14764 DCI
.AddToWorklist(Vec
.getNode());
14769 PerformPREDICATE_CASTCombine(SDNode
*N
, TargetLowering::DAGCombinerInfo
&DCI
) {
14770 EVT VT
= N
->getValueType(0);
14771 SDValue Op
= N
->getOperand(0);
14774 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
14775 if (Op
->getOpcode() == ARMISD::PREDICATE_CAST
) {
14776 // If the valuetypes are the same, we can remove the cast entirely.
14777 if (Op
->getOperand(0).getValueType() == VT
)
14778 return Op
->getOperand(0);
14779 return DCI
.DAG
.getNode(ARMISD::PREDICATE_CAST
, dl
, VT
, Op
->getOperand(0));
14782 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
14783 // more VPNOT which might get folded as else predicates.
14784 if (Op
.getValueType() == MVT::i32
&& isBitwiseNot(Op
)) {
14786 DCI
.DAG
.getNode(ARMISD::PREDICATE_CAST
, dl
, VT
, Op
->getOperand(0));
14787 SDValue C
= DCI
.DAG
.getNode(ARMISD::PREDICATE_CAST
, dl
, VT
,
14788 DCI
.DAG
.getConstant(65535, dl
, MVT::i32
));
14789 return DCI
.DAG
.getNode(ISD::XOR
, dl
, VT
, X
, C
);
14792 // Only the bottom 16 bits of the source register are used.
14793 if (Op
.getValueType() == MVT::i32
) {
14794 APInt DemandedMask
= APInt::getLowBitsSet(32, 16);
14795 const TargetLowering
&TLI
= DCI
.DAG
.getTargetLoweringInfo();
14796 if (TLI
.SimplifyDemandedBits(Op
, DemandedMask
, DCI
))
14797 return SDValue(N
, 0);
14802 static SDValue
PerformVECTOR_REG_CASTCombine(SDNode
*N
, SelectionDAG
&DAG
,
14803 const ARMSubtarget
*ST
) {
14804 EVT VT
= N
->getValueType(0);
14805 SDValue Op
= N
->getOperand(0);
14808 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
14809 if (ST
->isLittle())
14810 return DAG
.getNode(ISD::BITCAST
, dl
, VT
, Op
);
14812 // VECTOR_REG_CAST undef -> undef
14814 return DAG
.getUNDEF(VT
);
14816 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
14817 if (Op
->getOpcode() == ARMISD::VECTOR_REG_CAST
) {
14818 // If the valuetypes are the same, we can remove the cast entirely.
14819 if (Op
->getOperand(0).getValueType() == VT
)
14820 return Op
->getOperand(0);
14821 return DAG
.getNode(ARMISD::VECTOR_REG_CAST
, dl
, VT
, Op
->getOperand(0));
14827 static SDValue
PerformVCMPCombine(SDNode
*N
, SelectionDAG
&DAG
,
14828 const ARMSubtarget
*Subtarget
) {
14829 if (!Subtarget
->hasMVEIntegerOps())
14832 EVT VT
= N
->getValueType(0);
14833 SDValue Op0
= N
->getOperand(0);
14834 SDValue Op1
= N
->getOperand(1);
14835 ARMCC::CondCodes Cond
=
14836 (ARMCC::CondCodes
)cast
<ConstantSDNode
>(N
->getOperand(2))->getZExtValue();
14839 // vcmp X, 0, cc -> vcmpz X, cc
14840 if (isZeroVector(Op1
))
14841 return DAG
.getNode(ARMISD::VCMPZ
, dl
, VT
, Op0
, N
->getOperand(2));
14843 unsigned SwappedCond
= getSwappedCondition(Cond
);
14844 if (isValidMVECond(SwappedCond
, VT
.isFloatingPoint())) {
14845 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
14846 if (isZeroVector(Op0
))
14847 return DAG
.getNode(ARMISD::VCMPZ
, dl
, VT
, Op1
,
14848 DAG
.getConstant(SwappedCond
, dl
, MVT::i32
));
14849 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
14850 if (Op0
->getOpcode() == ARMISD::VDUP
&& Op1
->getOpcode() != ARMISD::VDUP
)
14851 return DAG
.getNode(ARMISD::VCMP
, dl
, VT
, Op1
, Op0
,
14852 DAG
.getConstant(SwappedCond
, dl
, MVT::i32
));
14858 /// PerformInsertEltCombine - Target-specific dag combine xforms for
14859 /// ISD::INSERT_VECTOR_ELT.
14860 static SDValue
PerformInsertEltCombine(SDNode
*N
,
14861 TargetLowering::DAGCombinerInfo
&DCI
) {
14862 // Bitcast an i64 load inserted into a vector to f64.
14863 // Otherwise, the i64 value will be legalized to a pair of i32 values.
14864 EVT VT
= N
->getValueType(0);
14865 SDNode
*Elt
= N
->getOperand(1).getNode();
14866 if (VT
.getVectorElementType() != MVT::i64
||
14867 !ISD::isNormalLoad(Elt
) || cast
<LoadSDNode
>(Elt
)->isVolatile())
14870 SelectionDAG
&DAG
= DCI
.DAG
;
14872 EVT FloatVT
= EVT::getVectorVT(*DAG
.getContext(), MVT::f64
,
14873 VT
.getVectorNumElements());
14874 SDValue Vec
= DAG
.getNode(ISD::BITCAST
, dl
, FloatVT
, N
->getOperand(0));
14875 SDValue V
= DAG
.getNode(ISD::BITCAST
, dl
, MVT::f64
, N
->getOperand(1));
14876 // Make the DAGCombiner fold the bitcasts.
14877 DCI
.AddToWorklist(Vec
.getNode());
14878 DCI
.AddToWorklist(V
.getNode());
14879 SDValue InsElt
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, FloatVT
,
14880 Vec
, V
, N
->getOperand(2));
14881 return DAG
.getNode(ISD::BITCAST
, dl
, VT
, InsElt
);
14884 // Convert a pair of extracts from the same base vector to a VMOVRRD. Either
14885 // directly or bitcast to an integer if the original is a float vector.
14886 // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
14887 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
14889 PerformExtractEltToVMOVRRD(SDNode
*N
, TargetLowering::DAGCombinerInfo
&DCI
) {
14890 EVT VT
= N
->getValueType(0);
14893 if (!DCI
.isAfterLegalizeDAG() || VT
!= MVT::i32
||
14894 !DCI
.DAG
.getTargetLoweringInfo().isTypeLegal(MVT::f64
))
14897 SDValue Ext
= SDValue(N
, 0);
14898 if (Ext
.getOpcode() == ISD::BITCAST
&&
14899 Ext
.getOperand(0).getValueType() == MVT::f32
)
14900 Ext
= Ext
.getOperand(0);
14901 if (Ext
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
14902 !isa
<ConstantSDNode
>(Ext
.getOperand(1)) ||
14903 Ext
.getConstantOperandVal(1) % 2 != 0)
14905 if (Ext
->use_size() == 1 &&
14906 (Ext
->use_begin()->getOpcode() == ISD::SINT_TO_FP
||
14907 Ext
->use_begin()->getOpcode() == ISD::UINT_TO_FP
))
14910 SDValue Op0
= Ext
.getOperand(0);
14911 EVT VecVT
= Op0
.getValueType();
14912 unsigned Lane
= Ext
.getConstantOperandVal(1);
14913 if (VecVT
.getVectorNumElements() != 4)
14916 // Find another extract, of Lane + 1
14917 auto OtherIt
= find_if(Op0
->uses(), [&](SDNode
*V
) {
14918 return V
->getOpcode() == ISD::EXTRACT_VECTOR_ELT
&&
14919 isa
<ConstantSDNode
>(V
->getOperand(1)) &&
14920 V
->getConstantOperandVal(1) == Lane
+ 1;
14922 if (OtherIt
== Op0
->uses().end())
14925 // For float extracts, we need to be converting to a i32 for both vector
14927 SDValue
OtherExt(*OtherIt
, 0);
14928 if (OtherExt
.getValueType() != MVT::i32
) {
14929 if (OtherExt
->use_size() != 1 ||
14930 OtherExt
->use_begin()->getOpcode() != ISD::BITCAST
||
14931 OtherExt
->use_begin()->getValueType(0) != MVT::i32
)
14933 OtherExt
= SDValue(*OtherExt
->use_begin(), 0);
14936 // Convert the type to a f64 and extract with a VMOVRRD.
14937 SDValue F64
= DCI
.DAG
.getNode(
14938 ISD::EXTRACT_VECTOR_ELT
, dl
, MVT::f64
,
14939 DCI
.DAG
.getNode(ARMISD::VECTOR_REG_CAST
, dl
, MVT::v2f64
, Op0
),
14940 DCI
.DAG
.getConstant(Ext
.getConstantOperandVal(1) / 2, dl
, MVT::i32
));
14942 DCI
.DAG
.getNode(ARMISD::VMOVRRD
, dl
, {MVT::i32
, MVT::i32
}, F64
);
14944 DCI
.CombineTo(OtherExt
.getNode(), SDValue(VMOVRRD
.getNode(), 1));
14948 static SDValue
PerformExtractEltCombine(SDNode
*N
,
14949 TargetLowering::DAGCombinerInfo
&DCI
,
14950 const ARMSubtarget
*ST
) {
14951 SDValue Op0
= N
->getOperand(0);
14952 EVT VT
= N
->getValueType(0);
14955 // extract (vdup x) -> x
14956 if (Op0
->getOpcode() == ARMISD::VDUP
) {
14957 SDValue X
= Op0
->getOperand(0);
14958 if (VT
== MVT::f16
&& X
.getValueType() == MVT::i32
)
14959 return DCI
.DAG
.getNode(ARMISD::VMOVhr
, dl
, VT
, X
);
14960 if (VT
== MVT::i32
&& X
.getValueType() == MVT::f16
)
14961 return DCI
.DAG
.getNode(ARMISD::VMOVrh
, dl
, VT
, X
);
14962 if (VT
== MVT::f32
&& X
.getValueType() == MVT::i32
)
14963 return DCI
.DAG
.getNode(ISD::BITCAST
, dl
, VT
, X
);
14965 while (X
.getValueType() != VT
&& X
->getOpcode() == ISD::BITCAST
)
14966 X
= X
->getOperand(0);
14967 if (X
.getValueType() == VT
)
14971 // extract ARM_BUILD_VECTOR -> x
14972 if (Op0
->getOpcode() == ARMISD::BUILD_VECTOR
&&
14973 isa
<ConstantSDNode
>(N
->getOperand(1)) &&
14974 N
->getConstantOperandVal(1) < Op0
.getNumOperands()) {
14975 return Op0
.getOperand(N
->getConstantOperandVal(1));
14978 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
14979 if (Op0
.getValueType() == MVT::v4i32
&&
14980 isa
<ConstantSDNode
>(N
->getOperand(1)) &&
14981 Op0
.getOpcode() == ISD::BITCAST
&&
14982 Op0
.getOperand(0).getOpcode() == ISD::BUILD_VECTOR
&&
14983 Op0
.getOperand(0).getValueType() == MVT::v2f64
) {
14984 SDValue BV
= Op0
.getOperand(0);
14985 unsigned Offset
= N
->getConstantOperandVal(1);
14986 SDValue MOV
= BV
.getOperand(Offset
< 2 ? 0 : 1);
14987 if (MOV
.getOpcode() == ARMISD::VMOVDRR
)
14988 return MOV
.getOperand(ST
->isLittle() ? Offset
% 2 : 1 - Offset
% 2);
14991 // extract x, n; extract x, n+1 -> VMOVRRD x
14992 if (SDValue R
= PerformExtractEltToVMOVRRD(N
, DCI
))
14995 // extract (MVETrunc(x)) -> extract x
14996 if (Op0
->getOpcode() == ARMISD::MVETRUNC
) {
14997 unsigned Idx
= N
->getConstantOperandVal(1);
14999 Idx
/ Op0
->getOperand(0).getValueType().getVectorNumElements();
15001 Idx
% Op0
->getOperand(0).getValueType().getVectorNumElements();
15002 return DCI
.DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, VT
, Op0
.getOperand(Vec
),
15003 DCI
.DAG
.getConstant(SubIdx
, dl
, MVT::i32
));
15009 static SDValue
PerformSignExtendInregCombine(SDNode
*N
, SelectionDAG
&DAG
) {
15010 SDValue Op
= N
->getOperand(0);
15011 EVT VT
= N
->getValueType(0);
15013 // sext_inreg(VGETLANEu) -> VGETLANEs
15014 if (Op
.getOpcode() == ARMISD::VGETLANEu
&&
15015 cast
<VTSDNode
>(N
->getOperand(1))->getVT() ==
15016 Op
.getOperand(0).getValueType().getScalarType())
15017 return DAG
.getNode(ARMISD::VGETLANEs
, SDLoc(N
), VT
, Op
.getOperand(0),
15023 // When lowering complex nodes that we recognize, like VQDMULH and MULH, we
15024 // can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to
15025 // binop as the shuffles cancel out.
15026 static SDValue
FlattenVectorShuffle(ShuffleVectorSDNode
*N
, SelectionDAG
&DAG
) {
15027 EVT VT
= N
->getValueType(0);
15028 if (!N
->getOperand(1).isUndef() || N
->getOperand(0).getValueType() != VT
)
15030 SDValue Op
= N
->getOperand(0);
15032 // Looking for binary operators that will have been folded from
15033 // truncates/extends.
15034 switch (Op
.getOpcode()) {
15035 case ARMISD::VQDMULH
:
15045 ShuffleVectorSDNode
*Op0
= dyn_cast
<ShuffleVectorSDNode
>(Op
.getOperand(0));
15046 ShuffleVectorSDNode
*Op1
= dyn_cast
<ShuffleVectorSDNode
>(Op
.getOperand(1));
15047 if (!Op0
|| !Op1
|| !Op0
->getOperand(1).isUndef() ||
15048 !Op1
->getOperand(1).isUndef() || Op0
->getMask() != Op1
->getMask() ||
15049 Op0
->getOperand(0).getValueType() != VT
)
15052 // Check the mask turns into an identity shuffle.
15053 ArrayRef
<int> NMask
= N
->getMask();
15054 ArrayRef
<int> OpMask
= Op0
->getMask();
15055 for (int i
= 0, e
= NMask
.size(); i
!= e
; i
++) {
15056 if (NMask
[i
] > 0 && OpMask
[NMask
[i
]] > 0 && OpMask
[NMask
[i
]] != i
)
15060 return DAG
.getNode(Op
.getOpcode(), SDLoc(Op
), Op
.getValueType(),
15061 Op0
->getOperand(0), Op1
->getOperand(0));
15065 PerformInsertSubvectorCombine(SDNode
*N
, TargetLowering::DAGCombinerInfo
&DCI
) {
15066 SDValue Vec
= N
->getOperand(0);
15067 SDValue SubVec
= N
->getOperand(1);
15068 uint64_t IdxVal
= N
->getConstantOperandVal(2);
15069 EVT VecVT
= Vec
.getValueType();
15070 EVT SubVT
= SubVec
.getValueType();
15072 // Only do this for legal fixed vector types.
15073 if (!VecVT
.isFixedLengthVector() ||
15074 !DCI
.DAG
.getTargetLoweringInfo().isTypeLegal(VecVT
) ||
15075 !DCI
.DAG
.getTargetLoweringInfo().isTypeLegal(SubVT
))
15078 // Ignore widening patterns.
15079 if (IdxVal
== 0 && Vec
.isUndef())
15082 // Subvector must be half the width and an "aligned" insertion.
15083 unsigned NumSubElts
= SubVT
.getVectorNumElements();
15084 if ((SubVT
.getSizeInBits() * 2) != VecVT
.getSizeInBits() ||
15085 (IdxVal
!= 0 && IdxVal
!= NumSubElts
))
15088 // Fold insert_subvector -> concat_vectors
15089 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15090 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15095 Hi
= DCI
.DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, DL
, SubVT
, Vec
,
15096 DCI
.DAG
.getVectorIdxConstant(NumSubElts
, DL
));
15098 Lo
= DCI
.DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, DL
, SubVT
, Vec
,
15099 DCI
.DAG
.getVectorIdxConstant(0, DL
));
15102 return DCI
.DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, VecVT
, Lo
, Hi
);
15105 // shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15106 static SDValue
PerformShuffleVMOVNCombine(ShuffleVectorSDNode
*N
,
15107 SelectionDAG
&DAG
) {
15108 SDValue Trunc
= N
->getOperand(0);
15109 EVT VT
= Trunc
.getValueType();
15110 if (Trunc
.getOpcode() != ARMISD::MVETRUNC
|| !N
->getOperand(1).isUndef())
15114 if (isVMOVNTruncMask(N
->getMask(), VT
, 0))
15115 return DAG
.getNode(
15116 ARMISD::VMOVN
, DL
, VT
,
15117 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, Trunc
.getOperand(0)),
15118 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, Trunc
.getOperand(1)),
15119 DAG
.getConstant(1, DL
, MVT::i32
));
15120 else if (isVMOVNTruncMask(N
->getMask(), VT
, 1))
15121 return DAG
.getNode(
15122 ARMISD::VMOVN
, DL
, VT
,
15123 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, Trunc
.getOperand(1)),
15124 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, Trunc
.getOperand(0)),
15125 DAG
.getConstant(1, DL
, MVT::i32
));
15129 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15130 /// ISD::VECTOR_SHUFFLE.
15131 static SDValue
PerformVECTOR_SHUFFLECombine(SDNode
*N
, SelectionDAG
&DAG
) {
15132 if (SDValue R
= FlattenVectorShuffle(cast
<ShuffleVectorSDNode
>(N
), DAG
))
15134 if (SDValue R
= PerformShuffleVMOVNCombine(cast
<ShuffleVectorSDNode
>(N
), DAG
))
15137 // The LLVM shufflevector instruction does not require the shuffle mask
15138 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15139 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15140 // operands do not match the mask length, they are extended by concatenating
15141 // them with undef vectors. That is probably the right thing for other
15142 // targets, but for NEON it is better to concatenate two double-register
15143 // size vector operands into a single quad-register size vector. Do that
15144 // transformation here:
15145 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15146 // shuffle(concat(v1, v2), undef)
15147 SDValue Op0
= N
->getOperand(0);
15148 SDValue Op1
= N
->getOperand(1);
15149 if (Op0
.getOpcode() != ISD::CONCAT_VECTORS
||
15150 Op1
.getOpcode() != ISD::CONCAT_VECTORS
||
15151 Op0
.getNumOperands() != 2 ||
15152 Op1
.getNumOperands() != 2)
15154 SDValue Concat0Op1
= Op0
.getOperand(1);
15155 SDValue Concat1Op1
= Op1
.getOperand(1);
15156 if (!Concat0Op1
.isUndef() || !Concat1Op1
.isUndef())
15158 // Skip the transformation if any of the types are illegal.
15159 const TargetLowering
&TLI
= DAG
.getTargetLoweringInfo();
15160 EVT VT
= N
->getValueType(0);
15161 if (!TLI
.isTypeLegal(VT
) ||
15162 !TLI
.isTypeLegal(Concat0Op1
.getValueType()) ||
15163 !TLI
.isTypeLegal(Concat1Op1
.getValueType()))
15166 SDValue NewConcat
= DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(N
), VT
,
15167 Op0
.getOperand(0), Op1
.getOperand(0));
15168 // Translate the shuffle mask.
15169 SmallVector
<int, 16> NewMask
;
15170 unsigned NumElts
= VT
.getVectorNumElements();
15171 unsigned HalfElts
= NumElts
/2;
15172 ShuffleVectorSDNode
*SVN
= cast
<ShuffleVectorSDNode
>(N
);
15173 for (unsigned n
= 0; n
< NumElts
; ++n
) {
15174 int MaskElt
= SVN
->getMaskElt(n
);
15176 if (MaskElt
< (int)HalfElts
)
15178 else if (MaskElt
>= (int)NumElts
&& MaskElt
< (int)(NumElts
+ HalfElts
))
15179 NewElt
= HalfElts
+ MaskElt
- NumElts
;
15180 NewMask
.push_back(NewElt
);
15182 return DAG
.getVectorShuffle(VT
, SDLoc(N
), NewConcat
,
15183 DAG
.getUNDEF(VT
), NewMask
);
15186 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
15187 /// NEON load/store intrinsics, and generic vector load/stores, to merge
15188 /// base address updates.
15189 /// For generic load/stores, the memory type is assumed to be a vector.
15190 /// The caller is assumed to have checked legality.
15191 static SDValue
CombineBaseUpdate(SDNode
*N
,
15192 TargetLowering::DAGCombinerInfo
&DCI
) {
15193 SelectionDAG
&DAG
= DCI
.DAG
;
15194 const bool isIntrinsic
= (N
->getOpcode() == ISD::INTRINSIC_VOID
||
15195 N
->getOpcode() == ISD::INTRINSIC_W_CHAIN
);
15196 const bool isStore
= N
->getOpcode() == ISD::STORE
;
15197 const unsigned AddrOpIdx
= ((isIntrinsic
|| isStore
) ? 2 : 1);
15198 SDValue Addr
= N
->getOperand(AddrOpIdx
);
15199 MemSDNode
*MemN
= cast
<MemSDNode
>(N
);
15202 // Search for a use of the address operand that is an increment.
15203 for (SDNode::use_iterator UI
= Addr
.getNode()->use_begin(),
15204 UE
= Addr
.getNode()->use_end(); UI
!= UE
; ++UI
) {
15205 SDNode
*User
= *UI
;
15206 if (User
->getOpcode() != ISD::ADD
||
15207 UI
.getUse().getResNo() != Addr
.getResNo())
15210 // Check that the add is independent of the load/store. Otherwise, folding
15211 // it would create a cycle. We can avoid searching through Addr as it's a
15212 // predecessor to both.
15213 SmallPtrSet
<const SDNode
*, 32> Visited
;
15214 SmallVector
<const SDNode
*, 16> Worklist
;
15215 Visited
.insert(Addr
.getNode());
15216 Worklist
.push_back(N
);
15217 Worklist
.push_back(User
);
15218 if (SDNode::hasPredecessorHelper(N
, Visited
, Worklist
) ||
15219 SDNode::hasPredecessorHelper(User
, Visited
, Worklist
))
15222 // Find the new opcode for the updating load/store.
15223 bool isLoadOp
= true;
15224 bool isLaneOp
= false;
15225 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15227 bool hasAlignment
= true;
15228 unsigned NewOpc
= 0;
15229 unsigned NumVecs
= 0;
15231 unsigned IntNo
= cast
<ConstantSDNode
>(N
->getOperand(1))->getZExtValue();
15233 default: llvm_unreachable("unexpected intrinsic for Neon base update");
15234 case Intrinsic::arm_neon_vld1
: NewOpc
= ARMISD::VLD1_UPD
;
15235 NumVecs
= 1; break;
15236 case Intrinsic::arm_neon_vld2
: NewOpc
= ARMISD::VLD2_UPD
;
15237 NumVecs
= 2; break;
15238 case Intrinsic::arm_neon_vld3
: NewOpc
= ARMISD::VLD3_UPD
;
15239 NumVecs
= 3; break;
15240 case Intrinsic::arm_neon_vld4
: NewOpc
= ARMISD::VLD4_UPD
;
15241 NumVecs
= 4; break;
15242 case Intrinsic::arm_neon_vld1x2
: NewOpc
= ARMISD::VLD1x2_UPD
;
15243 NumVecs
= 2; hasAlignment
= false; break;
15244 case Intrinsic::arm_neon_vld1x3
: NewOpc
= ARMISD::VLD1x3_UPD
;
15245 NumVecs
= 3; hasAlignment
= false; break;
15246 case Intrinsic::arm_neon_vld1x4
: NewOpc
= ARMISD::VLD1x4_UPD
;
15247 NumVecs
= 4; hasAlignment
= false; break;
15248 case Intrinsic::arm_neon_vld2dup
: NewOpc
= ARMISD::VLD2DUP_UPD
;
15249 NumVecs
= 2; break;
15250 case Intrinsic::arm_neon_vld3dup
: NewOpc
= ARMISD::VLD3DUP_UPD
;
15251 NumVecs
= 3; break;
    case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4; break;
    case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
      NumVecs = 2; isLaneOp = true; break;
    case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
      NumVecs = 3; isLaneOp = true; break;
    case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
      NumVecs = 4; isLaneOp = true; break;
    case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1; isLoadOp = false; break;
    case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2; isLoadOp = false; break;
    case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
      NumVecs = 3; isLoadOp = false; break;
    case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4; isLoadOp = false; break;
    case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
      NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
    case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
      NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
    case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
      NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
    case Intrinsic::arm_neon_vst1x2:   NewOpc = ARMISD::VST1x2_UPD;
      NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
    case Intrinsic::arm_neon_vst1x3:   NewOpc = ARMISD::VST1x3_UPD;
      NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
    case Intrinsic::arm_neon_vst1x4:   NewOpc = ARMISD::VST1x4_UPD;
      NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
      }
    } else {
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; isLaneOp = false; break;
      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
      }
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else if (isIntrinsic) {
      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
    } else {
      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
      VecTy = N->getOperand(1).getValueType();
    }

    bool isVLDDUPOp =
        NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
        NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
    if (isLaneOp || isVLDDUPOp)
      NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants; and it's
    // easier to introduce bitcasts here than fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics). We need to change the
    //   memory type to match the explicit alignment. That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      unsigned LastOperand =
          hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
      for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value. Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is a non-standard-aligned LOAD, the first result is the loaded
    // value. Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

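// Illustrative sketch of the base-update combine performed by
// CombineBaseUpdate above (registers and types are assumptions, not taken
// from a real compilation): a NEON load such as
//   %vld = call ... @llvm.arm.neon.vld2.v4i16.p0i8(i8* %p, i32 2)
// followed by
//   %p.next = add i32 %p.int, 16      ; 16 bytes == 2 x v4i16
// is folded into a single ARMISD::VLD2_UPD node, which selects to the
// post-indexed form "vld2.16 {d16, d17}, [r0]!", so the separate address
// add disappears.
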
static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

static SDValue PerformMVEVLDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Addr = N->getOperand(2);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // For the stores, where there are multiple intrinsics we only actually want
  // to post-inc the last of them.
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_mve_vst2q &&
      cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
    return SDValue();
  if (IntNo == Intrinsic::arm_mve_vst4q &&
      cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
    return SDValue();

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
                            UE = Addr.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
    case Intrinsic::arm_mve_vld2q:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_mve_vld4q:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_mve_vst2q:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_mve_vst4q:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else {
      VecTy = N->getOperand(3).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (!CInc || CInc->getZExtValue() != NumBytes)
      continue;

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(2)); // ptr
    Ops.push_back(Inc);

    for (unsigned i = 3; i < N->getNumOperands(); ++i)
      Ops.push_back(N->getOperand(i));

    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }

  return SDValue();
}

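// Illustrative sketch for PerformMVEVLDCombine above (assumed types and
// registers): an @llvm.arm.mve.vld2q load of 32 bytes (2 x v8i16) whose
// address is also incremented by 32 becomes an ARMISD::VLD2_UPD, which can
// select to the write-back form, e.g.
//   vld20.16 {q0, q1}, [r0]
//   vld21.16 {q0, q1}, [r0]!
// so the pointer update is produced by the load itself.
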
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
      cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}

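// Illustrative sketch for CombineVLDDUP above (assumed operands): if every
// vector result of
//   %vld = call ... @llvm.arm.neon.vld2lane.v8i8(..., i32 1, i32 1)
// is used only by ARMISD::VDUPLANE nodes selecting that same lane, the whole
// group is replaced by ARMISD::VLD2DUP, i.e. an all-lanes load such as
//   vld2.8 {d16[], d17[]}, [r0]
// and the original lane load becomes dead apart from its chain.
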
/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
  if (Subtarget->hasMVEIntegerOps()) {
    EVT ExtractVT = VT.getVectorElementType();
    // We need to ensure we are creating a legal type.
    if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
      ExtractVT = MVT::i32;
    SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
                                      ExtractVT, N->getOperand(0),
                                      N->getOperand(1));
    return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
  }

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant. Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
    EltSize = 8;
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  if (Subtarget->hasMVEIntegerOps()) {
    // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value
    // will need to come from a GPR.
    if (Op.getValueType() == MVT::f32)
      return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                         DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
    else if (Op.getValueType() == MVT::f16)
      return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                         DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
  }

  if (!Subtarget->hasNEON())
    return SDValue();

  // Match VDUP(LOAD) -> VLD1DUP.
  // We match this pattern here rather than waiting for isel because the
  // transform is only legal for unindexed loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
    SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
                     DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)};
    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
    SDValue VLDDup =
        DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
                                LD->getMemoryVT(), LD->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
    return VLDDup;
  }

  return SDValue();
}

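// Illustrative sketch for the VDUP(LOAD) case above (assumed types): a scalar
// load feeding a VDUP, e.g.
//   %s = load i16, i16* %p
//   %v = VDUP %s
// is turned into a single ARMISD::VLD1DUP, which selects to
//   vld1.16 {d16[]}, [r0]
// provided the load is unindexed and the VDUP is its only user.
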
static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);

  // If this is a legal vector load, try to combine it into a VLD1_UPD.
  if (ISD::isNormalLoad(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

// Optimize trunc store (of multiple scalars) to shuffle and store. First,
// pack all of the elements in one place. Next, store to memory in fewer
// chunks.
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
                                             SelectionDAG &DAG) {
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (!St->isTruncatingStore() || !VT.isVector())
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT StVT = St->getMemoryVT();
  unsigned NumElems = VT.getVectorNumElements();
  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromEltSz = VT.getScalarSizeInBits();
  unsigned ToEltSz = StVT.getScalarSizeInBits();

  // From, To sizes and ElemCount must be pow of two.
  if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
    return SDValue();

  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  if (0 != (NumElems * FromEltSz) % ToEltSz)
    return SDValue();

  unsigned SizeRatio = FromEltSz / ToEltSz;
  assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                   NumElems * SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDLoc DL(St);
  SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i < NumElems; ++i)
    ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
                                                      : i * SizeRatio;

  // Can't shuffle using an illegal type.
  if (!TLI.isTypeLegal(WideVecVT))
    return SDValue();

  SDValue Shuff = DAG.getVectorShuffle(
      WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
  // At this point all of the data is stored at the bottom of the
  // register. We now need to save it to mem.

  // Find the largest store unit.
  MVT StoreType = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
      StoreType = Tp;
  }
  // Didn't find a legal store type.
  if (!TLI.isTypeLegal(StoreType))
    return SDValue();

  // Bitcast the original vector into a vector of store-size units.
  EVT StoreVecVT =
      EVT::getVectorVT(*DAG.getContext(), StoreType,
                       VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
  assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
  SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
  SmallVector<SDValue, 8> Chains;
  SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                      TLI.getPointerTy(DAG.getDataLayout()));
  SDValue BasePtr = St->getBasePtr();

  // Perform one or more big stores into memory.
  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  for (unsigned I = 0; I < E; I++) {
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
    SDValue Ch =
        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    BasePtr =
        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
    Chains.push_back(Ch);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

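// Illustrative sketch for PerformTruncatingStoreCombine above (assumed
// types): a truncating store of <4 x i32> to <4 x i16> is rewritten as a
// shuffle that packs the four 16-bit values into the low half of the wide
// register, followed by one 64-bit integer store, rather than storing the
// elements individually.
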
// Try taking a single vector store from an fpround (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
                                                 SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ISD::FP_ROUND)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
    return SDValue();

  unsigned NumElements = 4;
  if (FromVT.getVectorNumElements() % NumElements != 0)
    return SDValue();

  // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
  // use the VMOVN over splitting the store. We are looking for patterns of:
  // !rev: 0 N 1 N+1 2 N+2 ...
  //  rev: N 0 N+1 1 N+2 2 ...
  // The shuffle may either be a single source (in which case N = NumElts/2) or
  // two inputs extended with concat to the same size (in which case N =
  // NumElts).
  auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
    ArrayRef<int> M = SVN->getMask();
    unsigned NumElts = ToVT.getVectorNumElements();
    if (SVN->getOperand(1).isUndef())
      NumElts /= 2;

    unsigned Off0 = Rev ? NumElts : 0;
    unsigned Off1 = Rev ? 0 : NumElts;

    for (unsigned I = 0; I < NumElts; I += 2) {
      if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
        return false;
      if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
        return false;
    }

    return true;
  };

  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
    if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
      return SDValue();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(St);
  // Details about the old store.
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  Align Alignment = St->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  // We split the store into slices of NumElements. fp16 trunc stores are vcvt
  // and then stored as truncating integer stores.
  EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
  EVT NewToVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);

  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));

    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                    DAG.getConstant(i * NumElements, DL, MVT::i32));

    SDValue FPTrunc =
        DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
                    Extract, DAG.getConstant(0, DL, MVT::i32));
    Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);

    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment.value(), MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

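// Illustrative sketch for PerformSplittingToNarrowingStores above (assumed
// types): storing
//   %t = fptrunc <8 x float> %x to <8 x half>
// is split into two slices; each v4f32 subvector goes through ARMISD::VCVTN
// (f32 -> f16 narrowing) and is then stored as a 64-bit truncating integer
// store, avoiding a build_vector of the whole v8f16 value.
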
// Try taking a single vector store from an MVETRUNC (which would otherwise
// turn into an expensive buildvector) and splitting it into a series of
// narrowing stores.
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
                                                         SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ARMISD::MVETRUNC)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(St);
  // Details about the old store.
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  Align Alignment = St->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
                                 FromVT.getVectorNumElements());

  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
    unsigned NewOffset =
        i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));

    SDValue Extract = Trunc.getOperand(i);
    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment.value(), MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

// Given a floating point store from an extracted vector, with an integer
// VGETLANE that already exists, store the existing VGETLANEu directly. This
// can help reduce fp register pressure, doesn't require the fp extract and
// allows use of more integer post-inc stores not available with vstr.
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Extract = St->getValue();
  EVT VT = Extract.getValueType();
  // For now only uses f16. This may be useful for f32 too, but that will
  // be bitcast(extract), not the VGETLANEu we currently check here.
  if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  SDNode *GetLane =
      DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
                          {Extract.getOperand(0), Extract.getOperand(1)});
  if (!GetLane)
    return SDValue();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(St);
  // Create a new integer store to replace the existing floating point version.
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  Align Alignment = St->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();
  EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
  SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
                                    St->getPointerInfo(), NewToVT,
                                    Alignment.value(), MMOFlags, AAInfo);

  return Store;
}

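// Illustrative sketch for PerformExtractFpToIntStores above (assumed types):
// for
//   %e = extractelement <8 x half> %v, i32 1
//   store half %e, half* %p
// when an ARMISD::VGETLANEu of the same vector and lane already exists, the
// fp store is replaced by a 16-bit truncating integer store of that lane
// value, so the data can stay on the integer side (where post-inc stores are
// available) instead of going through an fp extract and vstr.
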
/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (St->isVolatile())
    return SDValue();
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();

  if (Subtarget->hasNEON())
    if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
      return Store;

  if (Subtarget->hasMVEIntegerOps()) {
    if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
      return NewToken;
    if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
      return NewChain;
    if (SDValue NewToken =
            PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
      return NewToken;
  }

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON
  // and ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
        St->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo().getWithOffset(4),
                        St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal.getNode());
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

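// Illustrative sketch of the VMOVDRR case in PerformSTORECombine above
// (little-endian, assumed registers): instead of
//   vmov  d16, r0, r1
//   vstr  d16, [r2]
// the two GPR halves are stored directly as a pair of i32 stores
//   str   r0, [r2]
//   str   r1, [r2, #4]
// so NEON and ARM stores of arguments are not mixed in the same cache line.
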
/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();

  assert(N->getOpcode() == ISD::VECREDUCE_ADD);
  EVT ResVT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDLoc dl(N);

  // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y).
  if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
      (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
       N0.getValueType() == MVT::v16i8)) {
    SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
    SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
    return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
  }

  // We are looking for something that will have illegal types if left alone,
  // but that we can convert to a single instruction under MVE. For example
  // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
  // or
  // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
  //
  // The legal cases are:
  //   VADDV u/s 8/16/32
  //   VMLAV u/s 8/16/32
  //   VADDLV u/s 32
  //   VMLALV u/s 16/32
  //
  // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
  // extend it and use v4i32 instead.
  auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
    EVT AVT = A.getValueType();
    return any_of(ExtTypes, [&](MVT Ty) {
      return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
             AVT.bitsLE(Ty);
    });
  };
  auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
    EVT AVT = A.getValueType();
    if (!AVT.is128BitVector())
      A = DAG.getNode(ExtendCode, dl,
                      AVT.changeVectorElementType(MVT::getIntegerVT(
                          128 / AVT.getVectorMinNumElements())),
                      A);
    return A;
  };
  auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
    if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = N0->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
                         ArrayRef<MVT> ExtTypes, SDValue &Mask) {
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return SDValue();
    Mask = N0->getOperand(0);
    SDValue Ext = N0->getOperand(1);
    if (Ext->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = Ext->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                     SDValue &A, SDValue &B) {
    // For a vmla we are trying to match a larger pattern:
    // ExtA = sext/zext A
    // ExtB = sext/zext B
    // Mul = mul ExtA, ExtB
    // vecreduce.add Mul
    // There might also be an extra extend between the mul and the addreduce,
    // so long as the bitwidth is high enough to make them equivalent (for
    // example the original v8i16 might be mul at v8i32 and the reduce happens
    // at v8i64).
    if (ResVT != RetTy)
      return false;
    SDValue Mul = N0;
    if (Mul->getOpcode() == ExtendCode &&
        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
            ResVT.getScalarSizeInBits())
      Mul = Mul->getOperand(0);
    if (Mul->getOpcode() != ISD::MUL)
      return false;
    SDValue ExtA = Mul->getOperand(0);
    SDValue ExtB = Mul->getOperand(1);
    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
      return false;
    A = ExtA->getOperand(0);
    B = ExtB->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
      A = ExtendIfNeeded(A, ExtendCode);
      B = ExtendIfNeeded(B, ExtendCode);
      return true;
    }
    return false;
  };
  auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                         SDValue &A, SDValue &B, SDValue &Mask) {
    // Same as the pattern above with a select for the zero predicated lanes
    // ExtA = sext/zext A
    // ExtB = sext/zext B
    // Mul = mul ExtA, ExtB
    // N0 = select Mask, Mul, 0
    // vecreduce.add N0
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return false;
    Mask = N0->getOperand(0);
    SDValue Mul = N0->getOperand(1);
    if (Mul->getOpcode() == ExtendCode &&
        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
            ResVT.getScalarSizeInBits())
      Mul = Mul->getOperand(0);
    if (Mul->getOpcode() != ISD::MUL)
      return false;
    SDValue ExtA = Mul->getOperand(0);
    SDValue ExtB = Mul->getOperand(1);
    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
      return false;
    A = ExtA->getOperand(0);
    B = ExtB->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
      A = ExtendIfNeeded(A, ExtendCode);
      B = ExtendIfNeeded(B, ExtendCode);
      return true;
    }
    return false;
  };
  auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
    // Split illegal MVT::v16i8->i64 vector reductions into two legal
    // v8i16->i64 reductions. The operands are extended with MVEEXT, but as
    // they are reductions the lane orders do not matter. MVEEXT may be
    // combined with loads to produce two extending loads, or else they will
    // be expanded to VREV/VMOVL.
    EVT VT = Ops[0].getValueType();
    if (VT == MVT::v16i8) {
      assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
             "Unexpected illegal long reduction opcode");
      bool IsUnsigned = Opcode == ARMISD::VMLALVu;

      SDValue Ext0 =
          DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
                      DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
      SDValue Ext1 =
          DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
                      DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);

      SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                                 Ext0, Ext1);
      SDValue MLA1 =
          DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
                      DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
                      Ext0.getValue(1), Ext1.getValue(1));
      return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
    }
    SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
                       SDValue(Node.getNode(), 1));
  };

  SDValue A, B;
  SDValue Mask;
  if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
    return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
  if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
    return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
  if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
              A, B))
    return Create64bitNode(ARMISD::VMLALVs, {A, B});
  if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
              A, B))
    return Create64bitNode(ARMISD::VMLALVu, {A, B});
  if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
  if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));

  if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
                  Mask))
    return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
  if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
                  Mask))
    return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
  if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
                  Mask))
    return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
  if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
                  Mask))
    return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
  if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
  if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));

  if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
    return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
  if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
    return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
  if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
    return Create64bitNode(ARMISD::VADDLVs, {A});
  if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
    return Create64bitNode(ARMISD::VADDLVu, {A});
  if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
  if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));

  if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND,
                              {MVT::v8i16, MVT::v16i8}, Mask))
    return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
  if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND,
                              {MVT::v8i16, MVT::v16i8}, Mask))
    return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
  if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
    return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
  if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
    return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
  if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
  if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));

  // Some complications. We can get a case where the two inputs of the mul are
  // the same, then the output sext will have been helpfully converted to a
  // zext. Turn it back.
  SDValue Op = N0;
  if (Op->getOpcode() == ISD::VSELECT)
    Op = Op->getOperand(1);
  if (Op->getOpcode() == ISD::ZERO_EXTEND &&
      Op->getOperand(0)->getOpcode() == ISD::MUL) {
    SDValue Mul = Op->getOperand(0);
    if (Mul->getOperand(0) == Mul->getOperand(1) &&
        Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
      SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
      if (Op != N0)
        Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
                          N0->getOperand(0), Ext, N0->getOperand(2));
      return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
    }
  }

  return SDValue();
}

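// Illustrative sketch for PerformVECREDUCE_ADDCombine above (assumed types):
//   %e = zext <16 x i8> %a to <16 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
// would need an illegal v16i32 if left alone, but matches the VADDV pattern
// and becomes a single ARMISD::VADDVu, i.e. "vaddv.u8 r0, q0".
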
static SDValue PerformVMOVNCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  unsigned IsTop = N->getConstantOperandVal(2);

  // VMOVNT a undef -> a
  // VMOVNB a undef -> a
  // VMOVNB undef a -> a
  if (Op1->isUndef())
    return Op0;
  if (Op0->isUndef() && !IsTop)
    return Op1;

  // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
  // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
  if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
       Op1->getOpcode() == ARMISD::VQMOVNu) &&
      Op1->getConstantOperandVal(2) == 0)
    return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
                           Op0, Op1->getOperand(1), N->getOperand(2));

  // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes
  // from Qd (Op0) are demanded from a VMOVN, depending on whether we are
  // inserting into the top or bottom lanes.
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
  APInt Op0DemandedElts =
      IsTop ? Op1DemandedElts
            : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));

  APInt KnownUndef, KnownZero;
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);
  if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);

  return SDValue();
}

static SDValue PerformVQMOVNCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  unsigned IsTop = N->getConstantOperandVal(2);

  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  APInt Op0DemandedElts =
      APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                     : APInt::getHighBitsSet(2, 1));

  APInt KnownUndef, KnownZero;
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);
  return SDValue();
}

static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Turn X << -C -> X >> C and vice versa. The negative shifts can come up
  // from uses of the intrinsics.
  if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    int ShiftAmt = C->getSExtValue();
    if (ShiftAmt == 0) {
      SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
      DAG.ReplaceAllUsesWith(N, Merge.getNode());
      return SDValue();
    }

    if (ShiftAmt >= -32 && ShiftAmt < 0) {
      unsigned NewOpcode =
          N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
      SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
                                     DAG.getConstant(-ShiftAmt, DL, MVT::i32));
      DAG.ReplaceAllUsesWith(N, NewShift.getNode());
      return SDValue();
    }
  }

  return SDValue();
}

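// Illustrative sketch for PerformLongShiftCombine above: an ARMISD::LSLL of a
// 64-bit value (carried as two i32 operands) by a constant -12 is rewritten
// as an ARMISD::LSRL by 12, since negative counts coming from the long-shift
// intrinsics mean a shift in the other direction; a count of 0 just forwards
// the two inputs unchanged.
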
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHLIMM;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
                                                          : ARMISD::VSHRuIMM);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRsIMM;
      break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRuIMM;
      break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRNIMM;
      break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsuIMM;
      break;
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRIIMM;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;

  case Intrinsic::arm_mve_vqdmlah:
  case Intrinsic::arm_mve_vqdmlash:
  case Intrinsic::arm_mve_vqrdmlah:
  case Intrinsic::arm_mve_vqrdmlash:
  case Intrinsic::arm_mve_vmla_n_predicated:
  case Intrinsic::arm_mve_vmlas_n_predicated:
  case Intrinsic::arm_mve_vqdmlah_predicated:
  case Intrinsic::arm_mve_vqdmlash_predicated:
  case Intrinsic::arm_mve_vqrdmlah_predicated:
  case Intrinsic::arm_mve_vqrdmlash_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they return. So we don't need
    // any bits of that operand above that point, which allows us to eliminate
    // uxth/sxth.
    unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
      return SDValue();
    break;
  }

  case Intrinsic::arm_mve_minv:
  case Intrinsic::arm_mve_maxv:
  case Intrinsic::arm_mve_minav:
  case Intrinsic::arm_mve_maxav:
  case Intrinsic::arm_mve_minv_predicated:
  case Intrinsic::arm_mve_maxv_predicated:
  case Intrinsic::arm_mve_minav_predicated:
  case Intrinsic::arm_mve_maxav_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they take as the other input.
    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }

  case Intrinsic::arm_mve_addv: {
    // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
  }

  case Intrinsic::arm_mve_addlv:
  case Intrinsic::arm_mve_addlv_predicated: {
    // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
    // which recombines the two outputs into an i64.
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
                    (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
                    (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);

    SmallVector<SDValue, 4> Ops;
    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
      if (i != 2) // skip the unsigned flag
        Ops.push_back(N->getOperand(i));

    SDLoc dl(N);
    SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
                       val.getValue(1));
  }
  }

  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them. As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16 bits of x are zero. This optimizes rev + lsr 16 to rev16.
    SDValue N1 = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      SDValue N0 = N->getOperand(0);
      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
          DAG.MaskedValueIsZero(N0.getOperand(0),
                                APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
    }
  }

  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();
    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
    SDValue N0 = N->getOperand(0);
    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!ShiftAmtNode)
      return SDValue();
    uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (!AndMaskNode)
      return SDValue();
    uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
    // Don't transform uxtb/uxth.
    if (AndMask == 255 || AndMask == 65535)
      return SDValue();
    if (isMask_32(AndMask)) {
      uint32_t MaskedBits = countLeadingZeros(AndMask);
      if (MaskedBits > ShiftAmt) {
        SDLoc DL(N);
        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
        return DAG.getNode(
            ISD::SRL, DL, MVT::i32, SHL,
            DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
      }
    }
  }

  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();
  if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
    return SDValue();

  int64_t Cnt;

  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
      SDLoc dl(N);
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;

  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc =
          (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;
  }
  return SDValue();
}
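// Worked example of the Thumb1 path above (illustrative constants): for
//   (shl (and X, 0x3FF), 2)
// AndMask = 0x3FF is a mask, MaskedBits = 22 and ShiftAmt = 2, so the node is
// rewritten to
//   (srl (shl X, 22), 20)
// which produces the same value without materializing the mask constant.
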
16846 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be
16847 // split into multiple extending loads, which are simpler to deal with than an
16848 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL
16849 // to convert the type to an f32.
16850 static SDValue
PerformSplittingToWideningLoad(SDNode
*N
, SelectionDAG
&DAG
) {
16851 SDValue N0
= N
->getOperand(0);
16852 if (N0
.getOpcode() != ISD::LOAD
)
16854 LoadSDNode
*LD
= cast
<LoadSDNode
>(N0
.getNode());
16855 if (!LD
->isSimple() || !N0
.hasOneUse() || LD
->isIndexed() ||
16856 LD
->getExtensionType() != ISD::NON_EXTLOAD
)
16858 EVT FromVT
= LD
->getValueType(0);
16859 EVT ToVT
= N
->getValueType(0);
16860 if (!ToVT
.isVector())
16862 assert(FromVT
.getVectorNumElements() == ToVT
.getVectorNumElements());
16863 EVT ToEltVT
= ToVT
.getVectorElementType();
16864 EVT FromEltVT
= FromVT
.getVectorElementType();
16866 unsigned NumElements
= 0;
16867 if (ToEltVT
== MVT::i32
&& FromEltVT
== MVT::i8
)
16869 if (ToEltVT
== MVT::f32
&& FromEltVT
== MVT::f16
)
16871 if (NumElements
== 0 ||
16872 (FromEltVT
!= MVT::f16
&& FromVT
.getVectorNumElements() == NumElements
) ||
16873 FromVT
.getVectorNumElements() % NumElements
!= 0 ||
16874 !isPowerOf2_32(NumElements
))
16877 LLVMContext
&C
= *DAG
.getContext();
16879 // Details about the old load
16880 SDValue Ch
= LD
->getChain();
16881 SDValue BasePtr
= LD
->getBasePtr();
16882 Align Alignment
= LD
->getOriginalAlign();
16883 MachineMemOperand::Flags MMOFlags
= LD
->getMemOperand()->getFlags();
16884 AAMDNodes AAInfo
= LD
->getAAInfo();
16886 ISD::LoadExtType NewExtType
=
16887 N
->getOpcode() == ISD::SIGN_EXTEND
? ISD::SEXTLOAD
: ISD::ZEXTLOAD
;
16888 SDValue Offset
= DAG
.getUNDEF(BasePtr
.getValueType());
16889 EVT NewFromVT
= EVT::getVectorVT(
16890 C
, EVT::getIntegerVT(C
, FromEltVT
.getScalarSizeInBits()), NumElements
);
16891 EVT NewToVT
= EVT::getVectorVT(
16892 C
, EVT::getIntegerVT(C
, ToEltVT
.getScalarSizeInBits()), NumElements
);
16894 SmallVector
<SDValue
, 4> Loads
;
16895 SmallVector
<SDValue
, 4> Chains
;
16896 for (unsigned i
= 0; i
< FromVT
.getVectorNumElements() / NumElements
; i
++) {
16897 unsigned NewOffset
= (i
* NewFromVT
.getSizeInBits()) / 8;
16899 DAG
.getObjectPtrOffset(DL
, BasePtr
, TypeSize::Fixed(NewOffset
));
16902 DAG
.getLoad(ISD::UNINDEXED
, NewExtType
, NewToVT
, DL
, Ch
, NewPtr
, Offset
,
16903 LD
->getPointerInfo().getWithOffset(NewOffset
), NewFromVT
,
16904 Alignment
, MMOFlags
, AAInfo
);
16905 Loads
.push_back(NewLoad
);
16906 Chains
.push_back(SDValue(NewLoad
.getNode(), 1));
16909 // Float truncs need to extended with VCVTB's into their floating point types.
16910 if (FromEltVT
== MVT::f16
) {
16911 SmallVector
<SDValue
, 4> Extends
;
16913 for (unsigned i
= 0; i
< Loads
.size(); i
++) {
16915 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, MVT::v8f16
, Loads
[i
]);
16916 SDValue FPExt
= DAG
.getNode(ARMISD::VCVTL
, DL
, MVT::v4f32
, LoadBC
,
16917 DAG
.getConstant(0, DL
, MVT::i32
));
16918 Extends
.push_back(FPExt
);
16924 SDValue NewChain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Chains
);
16925 DAG
.ReplaceAllUsesOfValueWith(SDValue(LD
, 1), NewChain
);
16926 return DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, ToVT
, Loads
);
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8- and
  // 16-bit vector elements. NEON and MVE support these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    if (VT == MVT::i32 &&
        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) &&
        isa<ConstantSDNode>(Lane)) {

      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }

  if (ST->hasMVEIntegerOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;

  return SDValue();
}

static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  if (ST->hasMVEFloatOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;

  return SDValue();
}
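// As an illustration of the lane-extract case above: an i32-result
// (sext (extract_vector_elt v8i16:V, 3)) becomes (VGETLANEs V, 3), so the
// sign extension is folded into the lane move instead of surviving as a
// separate instruction after type legalization.
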
16983 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
16985 static SDValue
PerformMinMaxCombine(SDNode
*N
, SelectionDAG
&DAG
,
16986 const ARMSubtarget
*ST
) {
16987 EVT VT
= N
->getValueType(0);
16988 SDValue N0
= N
->getOperand(0);
16989 if (!ST
->hasMVEIntegerOps())
16992 if (SDValue V
= PerformVQDMULHCombine(N
, DAG
))
16995 if (VT
!= MVT::v4i32
&& VT
!= MVT::v8i16
)
16998 auto IsSignedSaturate
= [&](SDNode
*Min
, SDNode
*Max
) {
16999 // Check one is a smin and the other is a smax
17000 if (Min
->getOpcode() != ISD::SMIN
)
17001 std::swap(Min
, Max
);
17002 if (Min
->getOpcode() != ISD::SMIN
|| Max
->getOpcode() != ISD::SMAX
)
17006 if (VT
== MVT::v4i32
)
17007 SaturateC
= APInt(32, (1 << 15) - 1, true);
17008 else //if (VT == MVT::v8i16)
17009 SaturateC
= APInt(16, (1 << 7) - 1, true);
17012 if (!ISD::isConstantSplatVector(Min
->getOperand(1).getNode(), MinC
) ||
17015 if (!ISD::isConstantSplatVector(Max
->getOperand(1).getNode(), MaxC
) ||
17016 MaxC
!= ~SaturateC
)
17021 if (IsSignedSaturate(N
, N0
.getNode())) {
17024 if (VT
== MVT::v4i32
) {
17025 HalfVT
= MVT::v8i16
;
17026 ExtVT
= MVT::v4i16
;
17027 } else { // if (VT == MVT::v8i16)
17028 HalfVT
= MVT::v16i8
;
17032 // Create a VQMOVNB with undef top lanes, then signed extended into the top
17033 // half. That extend will hopefully be removed if only the bottom bits are
17034 // demanded (though a truncating store, for example).
17036 DAG
.getNode(ARMISD::VQMOVNs
, DL
, HalfVT
, DAG
.getUNDEF(HalfVT
),
17037 N0
->getOperand(0), DAG
.getConstant(0, DL
, MVT::i32
));
17038 SDValue Bitcast
= DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, VQMOVN
);
17039 return DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, VT
, Bitcast
,
17040 DAG
.getValueType(ExtVT
));
17043 auto IsUnsignedSaturate
= [&](SDNode
*Min
) {
17044 // For unsigned, we just need to check for <= 0xffff
17045 if (Min
->getOpcode() != ISD::UMIN
)
17049 if (VT
== MVT::v4i32
)
17050 SaturateC
= APInt(32, (1 << 16) - 1, true);
17051 else //if (VT == MVT::v8i16)
17052 SaturateC
= APInt(16, (1 << 8) - 1, true);
17055 if (!ISD::isConstantSplatVector(Min
->getOperand(1).getNode(), MinC
) ||
17061 if (IsUnsignedSaturate(N
)) {
17065 if (VT
== MVT::v4i32
) {
17066 HalfVT
= MVT::v8i16
;
17067 ExtConst
= 0x0000FFFF;
17068 } else { //if (VT == MVT::v8i16)
17069 HalfVT
= MVT::v16i8
;
17073 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
17074 // an AND. That extend will hopefully be removed if only the bottom bits are
17075 // demanded (though a truncating store, for example).
17077 DAG
.getNode(ARMISD::VQMOVNu
, DL
, HalfVT
, DAG
.getUNDEF(HalfVT
), N0
,
17078 DAG
.getConstant(0, DL
, MVT::i32
));
17079 SDValue Bitcast
= DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, VQMOVN
);
17080 return DAG
.getNode(ISD::AND
, DL
, VT
, Bitcast
,
17081 DAG
.getConstant(ExtConst
, DL
, VT
));
static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}
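// Helper for the CMOV combines below: it returns the APInt only when V is a
// constant with exactly one bit set, so callers can safely take logBase2() of
// the result to find the bit position.
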
17095 SDValue
ARMTargetLowering::PerformCMOVToBFICombine(SDNode
*CMOV
, SelectionDAG
&DAG
) const {
17096 // If we have a CMOV, OR and AND combination such as:
17101 // * CN is a single bit;
17102 // * All bits covered by CM are known zero in y
17104 // Then we can convert this into a sequence of BFI instructions. This will
17105 // always be a win if CM is a single bit, will always be no worse than the
17106 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
17107 // three bits (due to the extra IT instruction).
17109 SDValue Op0
= CMOV
->getOperand(0);
17110 SDValue Op1
= CMOV
->getOperand(1);
17111 auto CCNode
= cast
<ConstantSDNode
>(CMOV
->getOperand(2));
17112 auto CC
= CCNode
->getAPIntValue().getLimitedValue();
17113 SDValue CmpZ
= CMOV
->getOperand(4);
17115 // The compare must be against zero.
17116 if (!isNullConstant(CmpZ
->getOperand(1)))
17119 assert(CmpZ
->getOpcode() == ARMISD::CMPZ
);
17120 SDValue And
= CmpZ
->getOperand(0);
17121 if (And
->getOpcode() != ISD::AND
)
17123 const APInt
*AndC
= isPowerOf2Constant(And
->getOperand(1));
17126 SDValue X
= And
->getOperand(0);
17128 if (CC
== ARMCC::EQ
) {
17129 // We're performing an "equal to zero" compare. Swap the operands so we
17130 // canonicalize on a "not equal to zero" compare.
17131 std::swap(Op0
, Op1
);
17133 assert(CC
== ARMCC::NE
&& "How can a CMPZ node not be EQ or NE?");
17136 if (Op1
->getOpcode() != ISD::OR
)
17139 ConstantSDNode
*OrC
= dyn_cast
<ConstantSDNode
>(Op1
->getOperand(1));
17142 SDValue Y
= Op1
->getOperand(0);
17147 // Now, is it profitable to continue?
17148 APInt OrCI
= OrC
->getAPIntValue();
17149 unsigned Heuristic
= Subtarget
->isThumb() ? 3 : 2;
17150 if (OrCI
.countPopulation() > Heuristic
)
17153 // Lastly, can we determine that the bits defined by OrCI
17155 KnownBits Known
= DAG
.computeKnownBits(Y
);
17156 if ((OrCI
& Known
.Zero
) != OrCI
)
17159 // OK, we can do the combine.
17162 EVT VT
= X
.getValueType();
17163 unsigned BitInX
= AndC
->logBase2();
17166 // We must shift X first.
17167 X
= DAG
.getNode(ISD::SRL
, dl
, VT
, X
,
17168 DAG
.getConstant(BitInX
, dl
, VT
));
17171 for (unsigned BitInY
= 0, NumActiveBits
= OrCI
.getActiveBits();
17172 BitInY
< NumActiveBits
; ++BitInY
) {
17173 if (OrCI
[BitInY
] == 0)
17175 APInt
Mask(VT
.getSizeInBits(), 0);
17176 Mask
.setBit(BitInY
);
17177 V
= DAG
.getNode(ARMISD::BFI
, dl
, VT
, V
, X
,
17178 // Confusingly, the operand is an *inverted* mask.
17179 DAG
.getConstant(~Mask
, dl
, VT
));
17185 // Given N, the value controlling the conditional branch, search for the loop
17186 // intrinsic, returning it, along with how the value is used. We need to handle
17187 // patterns such as the following:
17188 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
17189 // (brcond (setcc (loop.decrement), 0, eq), exit)
17190 // (brcond (setcc (loop.decrement), 0, ne), header)
17191 static SDValue
SearchLoopIntrinsic(SDValue N
, ISD::CondCode
&CC
, int &Imm
,
17193 switch (N
->getOpcode()) {
17197 if (!isa
<ConstantSDNode
>(N
.getOperand(1)))
17199 if (!cast
<ConstantSDNode
>(N
.getOperand(1))->isOne())
17202 return SearchLoopIntrinsic(N
.getOperand(0), CC
, Imm
, Negate
);
17205 auto *Const
= dyn_cast
<ConstantSDNode
>(N
.getOperand(1));
17208 if (Const
->isNullValue())
17210 else if (Const
->isOne())
17214 CC
= cast
<CondCodeSDNode
>(N
.getOperand(2))->get();
17215 return SearchLoopIntrinsic(N
->getOperand(0), CC
, Imm
, Negate
);
17217 case ISD::INTRINSIC_W_CHAIN
: {
17218 unsigned IntOp
= cast
<ConstantSDNode
>(N
.getOperand(1))->getZExtValue();
17219 if (IntOp
!= Intrinsic::test_start_loop_iterations
&&
17220 IntOp
!= Intrinsic::loop_decrement_reg
)
17228 static SDValue
PerformHWLoopCombine(SDNode
*N
,
17229 TargetLowering::DAGCombinerInfo
&DCI
,
17230 const ARMSubtarget
*ST
) {
17232 // The hwloop intrinsics that we're interested are used for control-flow,
17233 // either for entering or exiting the loop:
17234 // - test.start.loop.iterations will test whether its operand is zero. If it
17235 // is zero, the proceeding branch should not enter the loop.
17236 // - loop.decrement.reg also tests whether its operand is zero. If it is
17237 // zero, the proceeding branch should not branch back to the beginning of
17239 // So here, we need to check that how the brcond is using the result of each
17240 // of the intrinsics to ensure that we're branching to the right place at the
17246 bool Negate
= false;
17247 SDValue Chain
= N
->getOperand(0);
17250 if (N
->getOpcode() == ISD::BRCOND
) {
17252 Cond
= N
->getOperand(1);
17253 Dest
= N
->getOperand(2);
17255 assert(N
->getOpcode() == ISD::BR_CC
&& "Expected BRCOND or BR_CC!");
17256 CC
= cast
<CondCodeSDNode
>(N
->getOperand(1))->get();
17257 Cond
= N
->getOperand(2);
17258 Dest
= N
->getOperand(4);
17259 if (auto *Const
= dyn_cast
<ConstantSDNode
>(N
->getOperand(3))) {
17260 if (!Const
->isOne() && !Const
->isNullValue())
17262 Imm
= Const
->getZExtValue();
17267 SDValue Int
= SearchLoopIntrinsic(Cond
, CC
, Imm
, Negate
);
17272 CC
= ISD::getSetCCInverse(CC
, /* Integer inverse */ MVT::i32
);
17274 auto IsTrueIfZero
= [](ISD::CondCode CC
, int Imm
) {
17275 return (CC
== ISD::SETEQ
&& Imm
== 0) ||
17276 (CC
== ISD::SETNE
&& Imm
== 1) ||
17277 (CC
== ISD::SETLT
&& Imm
== 1) ||
17278 (CC
== ISD::SETULT
&& Imm
== 1);
17281 auto IsFalseIfZero
= [](ISD::CondCode CC
, int Imm
) {
17282 return (CC
== ISD::SETEQ
&& Imm
== 1) ||
17283 (CC
== ISD::SETNE
&& Imm
== 0) ||
17284 (CC
== ISD::SETGT
&& Imm
== 0) ||
17285 (CC
== ISD::SETUGT
&& Imm
== 0) ||
17286 (CC
== ISD::SETGE
&& Imm
== 1) ||
17287 (CC
== ISD::SETUGE
&& Imm
== 1);
17290 assert((IsTrueIfZero(CC
, Imm
) || IsFalseIfZero(CC
, Imm
)) &&
17291 "unsupported condition");
17294 SelectionDAG
&DAG
= DCI
.DAG
;
17295 SDValue Elements
= Int
.getOperand(2);
17296 unsigned IntOp
= cast
<ConstantSDNode
>(Int
->getOperand(1))->getZExtValue();
17297 assert((N
->hasOneUse() && N
->use_begin()->getOpcode() == ISD::BR
)
17298 && "expected single br user");
17299 SDNode
*Br
= *N
->use_begin();
17300 SDValue OtherTarget
= Br
->getOperand(1);
17302 // Update the unconditional branch to branch to the given Dest.
17303 auto UpdateUncondBr
= [](SDNode
*Br
, SDValue Dest
, SelectionDAG
&DAG
) {
17304 SDValue NewBrOps
[] = { Br
->getOperand(0), Dest
};
17305 SDValue NewBr
= DAG
.getNode(ISD::BR
, SDLoc(Br
), MVT::Other
, NewBrOps
);
17306 DAG
.ReplaceAllUsesOfValueWith(SDValue(Br
, 0), NewBr
);
17309 if (IntOp
== Intrinsic::test_start_loop_iterations
) {
17311 SDValue Setup
= DAG
.getNode(ARMISD::WLSSETUP
, dl
, MVT::i32
, Elements
);
17312 // We expect this 'instruction' to branch when the counter is zero.
17313 if (IsTrueIfZero(CC
, Imm
)) {
17314 SDValue Ops
[] = {Chain
, Setup
, Dest
};
17315 Res
= DAG
.getNode(ARMISD::WLS
, dl
, MVT::Other
, Ops
);
17317 // The logic is the reverse of what we need for WLS, so find the other
17318 // basic block target: the target of the proceeding br.
17319 UpdateUncondBr(Br
, Dest
, DAG
);
17321 SDValue Ops
[] = {Chain
, Setup
, OtherTarget
};
17322 Res
= DAG
.getNode(ARMISD::WLS
, dl
, MVT::Other
, Ops
);
17324 // Update LR count to the new value
17325 DAG
.ReplaceAllUsesOfValueWith(Int
.getValue(0), Setup
);
17327 DAG
.ReplaceAllUsesOfValueWith(Int
.getValue(2), Int
.getOperand(0));
17330 SDValue Size
= DAG
.getTargetConstant(
17331 cast
<ConstantSDNode
>(Int
.getOperand(3))->getZExtValue(), dl
, MVT::i32
);
17332 SDValue Args
[] = { Int
.getOperand(0), Elements
, Size
, };
17333 SDValue LoopDec
= DAG
.getNode(ARMISD::LOOP_DEC
, dl
,
17334 DAG
.getVTList(MVT::i32
, MVT::Other
), Args
);
17335 DAG
.ReplaceAllUsesWith(Int
.getNode(), LoopDec
.getNode());
17337 // We expect this instruction to branch when the count is not zero.
17338 SDValue Target
= IsFalseIfZero(CC
, Imm
) ? Dest
: OtherTarget
;
17340 // Update the unconditional branch to target the loop preheader if we've
17341 // found the condition has been reversed.
17342 if (Target
== OtherTarget
)
17343 UpdateUncondBr(Br
, Dest
, DAG
);
17345 Chain
= DAG
.getNode(ISD::TokenFactor
, dl
, MVT::Other
,
17346 SDValue(LoopDec
.getNode(), 1), Chain
);
17348 SDValue EndArgs
[] = { Chain
, SDValue(LoopDec
.getNode(), 0), Target
};
17349 return DAG
.getNode(ARMISD::LE
, dl
, MVT::Other
, EndArgs
);
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue Chain = N->getOperand(0);
  SDValue BB = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
  // -> (brcond Chain BB CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
      LHS->getOperand(0)->hasOneUse()) {
    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
        (LHS01C && LHS01C->getZExtValue() == 1) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(
          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
    }
  }

  return SDValue();
}
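// Note on the pattern above: the (and ..., 1) typically comes from an i1
// condition that has been legalized into a full i32 register; re-testing that
// masked boolean against zero is equivalent to branching on the original
// condition, which is what this combine recovers.
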
17394 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
17396 ARMTargetLowering::PerformCMOVCombine(SDNode
*N
, SelectionDAG
&DAG
) const {
17397 SDValue Cmp
= N
->getOperand(4);
17398 if (Cmp
.getOpcode() != ARMISD::CMPZ
)
17399 // Only looking at EQ and NE cases.
17402 EVT VT
= N
->getValueType(0);
17404 SDValue LHS
= Cmp
.getOperand(0);
17405 SDValue RHS
= Cmp
.getOperand(1);
17406 SDValue FalseVal
= N
->getOperand(0);
17407 SDValue TrueVal
= N
->getOperand(1);
17408 SDValue ARMcc
= N
->getOperand(2);
17409 ARMCC::CondCodes CC
=
17410 (ARMCC::CondCodes
)cast
<ConstantSDNode
>(ARMcc
)->getZExtValue();
17412 // BFI is only available on V6T2+.
17413 if (!Subtarget
->isThumb1Only() && Subtarget
->hasV6T2Ops()) {
17414 SDValue R
= PerformCMOVToBFICombine(N
, DAG
);
17435 /// FIXME: Turn this into a target neutral optimization?
17437 if (CC
== ARMCC::NE
&& FalseVal
== RHS
&& FalseVal
!= LHS
) {
17438 Res
= DAG
.getNode(ARMISD::CMOV
, dl
, VT
, LHS
, TrueVal
, ARMcc
,
17439 N
->getOperand(3), Cmp
);
17440 } else if (CC
== ARMCC::EQ
&& TrueVal
== RHS
) {
17442 SDValue NewCmp
= getARMCmp(LHS
, RHS
, ISD::SETNE
, ARMcc
, DAG
, dl
);
17443 Res
= DAG
.getNode(ARMISD::CMOV
, dl
, VT
, LHS
, FalseVal
, ARMcc
,
17444 N
->getOperand(3), NewCmp
);
17447 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
17448 // -> (cmov F T CC CPSR Cmp)
17449 if (CC
== ARMCC::NE
&& LHS
.getOpcode() == ARMISD::CMOV
&& LHS
->hasOneUse()) {
17450 auto *LHS0C
= dyn_cast
<ConstantSDNode
>(LHS
->getOperand(0));
17451 auto *LHS1C
= dyn_cast
<ConstantSDNode
>(LHS
->getOperand(1));
17452 auto *RHSC
= dyn_cast
<ConstantSDNode
>(RHS
);
17453 if ((LHS0C
&& LHS0C
->getZExtValue() == 0) &&
17454 (LHS1C
&& LHS1C
->getZExtValue() == 1) &&
17455 (RHSC
&& RHSC
->getZExtValue() == 0)) {
17456 return DAG
.getNode(ARMISD::CMOV
, dl
, VT
, FalseVal
, TrueVal
,
17457 LHS
->getOperand(2), LHS
->getOperand(3),
17458 LHS
->getOperand(4));
17462 if (!VT
.isInteger())
17465 // Materialize a boolean comparison for integers so we can avoid branching.
17466 if (isNullConstant(FalseVal
)) {
17467 if (CC
== ARMCC::EQ
&& isOneConstant(TrueVal
)) {
17468 if (!Subtarget
->isThumb1Only() && Subtarget
->hasV5TOps()) {
17469 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
17470 // right 5 bits will make that 32 be 1, otherwise it will be 0.
17471 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
17472 SDValue Sub
= DAG
.getNode(ISD::SUB
, dl
, VT
, LHS
, RHS
);
17473 Res
= DAG
.getNode(ISD::SRL
, dl
, VT
, DAG
.getNode(ISD::CTLZ
, dl
, VT
, Sub
),
17474 DAG
.getConstant(5, dl
, MVT::i32
));
17476 // CMOV 0, 1, ==, (CMPZ x, y) ->
17477 // (ADDCARRY (SUB x, y), t:0, t:1)
17478 // where t = (SUBCARRY 0, (SUB x, y), 0)
17480 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
17481 // x != y. In other words, a carry C == 1 when x == y, C == 0
17483 // The final ADDCARRY computes
17484 // x - y + (0 - (x - y)) + C == C
17485 SDValue Sub
= DAG
.getNode(ISD::SUB
, dl
, VT
, LHS
, RHS
);
17486 SDVTList VTs
= DAG
.getVTList(VT
, MVT::i32
);
17487 SDValue Neg
= DAG
.getNode(ISD::USUBO
, dl
, VTs
, FalseVal
, Sub
);
17488 // ISD::SUBCARRY returns a borrow but we want the carry here
17491 DAG
.getNode(ISD::SUB
, dl
, MVT::i32
,
17492 DAG
.getConstant(1, dl
, MVT::i32
), Neg
.getValue(1));
17493 Res
= DAG
.getNode(ISD::ADDCARRY
, dl
, VTs
, Sub
, Neg
, Carry
);
17495 } else if (CC
== ARMCC::NE
&& !isNullConstant(RHS
) &&
17496 (!Subtarget
->isThumb1Only() || isPowerOf2Constant(TrueVal
))) {
17497 // This seems pointless but will allow us to combine it further below.
17498 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
17500 DAG
.getNode(ARMISD::SUBS
, dl
, DAG
.getVTList(VT
, MVT::i32
), LHS
, RHS
);
17501 SDValue CPSRGlue
= DAG
.getCopyToReg(DAG
.getEntryNode(), dl
, ARM::CPSR
,
17502 Sub
.getValue(1), SDValue());
17503 Res
= DAG
.getNode(ARMISD::CMOV
, dl
, VT
, Sub
, TrueVal
, ARMcc
,
17504 N
->getOperand(3), CPSRGlue
.getValue(1));
17507 } else if (isNullConstant(TrueVal
)) {
17508 if (CC
== ARMCC::EQ
&& !isNullConstant(RHS
) &&
17509 (!Subtarget
->isThumb1Only() || isPowerOf2Constant(FalseVal
))) {
17510 // This seems pointless but will allow us to combine it further below
17511 // Note that we change == for != as this is the dual for the case above.
17512 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
17514 DAG
.getNode(ARMISD::SUBS
, dl
, DAG
.getVTList(VT
, MVT::i32
), LHS
, RHS
);
17515 SDValue CPSRGlue
= DAG
.getCopyToReg(DAG
.getEntryNode(), dl
, ARM::CPSR
,
17516 Sub
.getValue(1), SDValue());
17517 Res
= DAG
.getNode(ARMISD::CMOV
, dl
, VT
, Sub
, FalseVal
,
17518 DAG
.getConstant(ARMCC::NE
, dl
, MVT::i32
),
17519 N
->getOperand(3), CPSRGlue
.getValue(1));
17524 // On Thumb1, the DAG above may be further combined if z is a power of 2
17526 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
17527 // t1 = (USUBO (SUB x, y), 1)
17528 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
17529 // Result = if K != 0 then (SHL t2:0, K) else t2:0
17531 // This also handles the special case of comparing against zero; it's
17532 // essentially, the same pattern, except there's no SUBS:
17533 // CMOV x, z, !=, (CMPZ x, 0) ->
17534 // t1 = (USUBO x, 1)
17535 // t2 = (SUBCARRY x, t1:0, t1:1)
17536 // Result = if K != 0 then (SHL t2:0, K) else t2:0
17537 const APInt
*TrueConst
;
17538 if (Subtarget
->isThumb1Only() && CC
== ARMCC::NE
&&
17539 ((FalseVal
.getOpcode() == ARMISD::SUBS
&&
17540 FalseVal
.getOperand(0) == LHS
&& FalseVal
.getOperand(1) == RHS
) ||
17541 (FalseVal
== LHS
&& isNullConstant(RHS
))) &&
17542 (TrueConst
= isPowerOf2Constant(TrueVal
))) {
17543 SDVTList VTs
= DAG
.getVTList(VT
, MVT::i32
);
17544 unsigned ShiftAmount
= TrueConst
->logBase2();
17546 TrueVal
= DAG
.getConstant(1, dl
, VT
);
17547 SDValue Subc
= DAG
.getNode(ISD::USUBO
, dl
, VTs
, FalseVal
, TrueVal
);
17548 Res
= DAG
.getNode(ISD::SUBCARRY
, dl
, VTs
, FalseVal
, Subc
, Subc
.getValue(1));
17551 Res
= DAG
.getNode(ISD::SHL
, dl
, VT
, Res
,
17552 DAG
.getConstant(ShiftAmount
, dl
, MVT::i32
));
17555 if (Res
.getNode()) {
17556 KnownBits Known
= DAG
.computeKnownBits(SDValue(N
,0));
17557 // Capture demanded bits information that would be otherwise lost.
17558 if (Known
.Zero
== 0xfffffffe)
17559 Res
= DAG
.getNode(ISD::AssertZext
, dl
, MVT::i32
, Res
,
17560 DAG
.getValueType(MVT::i1
));
17561 else if (Known
.Zero
== 0xffffff00)
17562 Res
= DAG
.getNode(ISD::AssertZext
, dl
, MVT::i32
, Res
,
17563 DAG
.getValueType(MVT::i8
));
17564 else if (Known
.Zero
== 0xffff0000)
17565 Res
= DAG
.getNode(ISD::AssertZext
, dl
, MVT::i32
, Res
,
17566 DAG
.getValueType(MVT::i16
));
static SDValue PerformBITCASTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);

  // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
  if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
      return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
  }

  // We may have a bitcast of something that has already had this bitcast
  // combine performed on it, so skip past any VECTOR_REG_CASTs.
  while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
    Src = Src.getOperand(0);

  // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
  // would be generated is at least the width of the element type.
  EVT SrcVT = Src.getValueType();
  if ((Src.getOpcode() == ARMISD::VMOVIMM ||
       Src.getOpcode() == ARMISD::VMVNIMM ||
       Src.getOpcode() == ARMISD::VMOVFPIMM) &&
      SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
      DAG.getDataLayout().isBigEndian())
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);

  // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
  if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
    return R;

  return SDValue();
}
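// Example of the first rewrite above (illustrative types): under MVE,
//   (v4f32 (bitcast (v4i32 (VDUP X))))
// becomes (v4f32 (VDUP X)); the scalar element widths match, so the same
// splat can produce the destination type directly and the bitcast disappears.
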
17608 // Some combines for the MVETrunc truncations legalizer helper. Also lowers the
17609 // node into stack operations after legalizeOps.
17610 SDValue
ARMTargetLowering::PerformMVETruncCombine(
17611 SDNode
*N
, TargetLowering::DAGCombinerInfo
&DCI
) const {
17612 SelectionDAG
&DAG
= DCI
.DAG
;
17613 EVT VT
= N
->getValueType(0);
17616 // MVETrunc(Undef, Undef) -> Undef
17617 if (all_of(N
->ops(), [](SDValue Op
) { return Op
.isUndef(); }))
17618 return DAG
.getUNDEF(VT
);
17620 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
17621 if (N
->getNumOperands() == 2 &&
17622 N
->getOperand(0).getOpcode() == ARMISD::MVETRUNC
&&
17623 N
->getOperand(1).getOpcode() == ARMISD::MVETRUNC
)
17624 return DAG
.getNode(ARMISD::MVETRUNC
, DL
, VT
, N
->getOperand(0).getOperand(0),
17625 N
->getOperand(0).getOperand(1),
17626 N
->getOperand(1).getOperand(0),
17627 N
->getOperand(1).getOperand(1));
17629 // MVETrunc(shuffle, shuffle) -> VMOVN
17630 if (N
->getNumOperands() == 2 &&
17631 N
->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE
&&
17632 N
->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE
) {
17633 auto *S0
= cast
<ShuffleVectorSDNode
>(N
->getOperand(0).getNode());
17634 auto *S1
= cast
<ShuffleVectorSDNode
>(N
->getOperand(1).getNode());
17636 if (S0
->getOperand(0) == S1
->getOperand(0) &&
17637 S0
->getOperand(1) == S1
->getOperand(1)) {
17638 // Construct complete shuffle mask
17639 SmallVector
<int, 8> Mask(S0
->getMask().begin(), S0
->getMask().end());
17640 Mask
.append(S1
->getMask().begin(), S1
->getMask().end());
17642 if (isVMOVNTruncMask(Mask
, VT
, 0))
17643 return DAG
.getNode(
17644 ARMISD::VMOVN
, DL
, VT
,
17645 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, S0
->getOperand(0)),
17646 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, S0
->getOperand(1)),
17647 DAG
.getConstant(1, DL
, MVT::i32
));
17648 if (isVMOVNTruncMask(Mask
, VT
, 1))
17649 return DAG
.getNode(
17650 ARMISD::VMOVN
, DL
, VT
,
17651 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, S0
->getOperand(1)),
17652 DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, S0
->getOperand(0)),
17653 DAG
.getConstant(1, DL
, MVT::i32
));
17657 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
17658 // truncate to a buildvector to allow the generic optimisations to kick in.
17659 if (all_of(N
->ops(), [](SDValue Op
) {
17660 return Op
.getOpcode() == ISD::BUILD_VECTOR
||
17661 Op
.getOpcode() == ISD::VECTOR_SHUFFLE
||
17662 (Op
.getOpcode() == ISD::BITCAST
&&
17663 Op
.getOperand(0).getOpcode() == ISD::BUILD_VECTOR
);
17665 SmallVector
<SDValue
, 8> Extracts
;
17666 for (unsigned Op
= 0; Op
< N
->getNumOperands(); Op
++) {
17667 SDValue O
= N
->getOperand(Op
);
17668 for (unsigned i
= 0; i
< O
.getValueType().getVectorNumElements(); i
++) {
17669 SDValue Ext
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, O
,
17670 DAG
.getConstant(i
, DL
, MVT::i32
));
17671 Extracts
.push_back(Ext
);
17674 return DAG
.getBuildVector(VT
, DL
, Extracts
);
17677 // If we are late in the legalization process and nothing has optimised
17678 // the trunc to anything better, lower it to a stack store and reload,
17679 // performing the truncation whilst keeping the lanes in the correct order:
17680 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
17681 if (!DCI
.isAfterLegalizeDAG())
17684 SDValue StackPtr
= DAG
.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
17685 int SPFI
= cast
<FrameIndexSDNode
>(StackPtr
.getNode())->getIndex();
17686 int NumIns
= N
->getNumOperands();
17687 assert((NumIns
== 2 || NumIns
== 4) &&
17688 "Expected 2 or 4 inputs to an MVETrunc");
17689 EVT StoreVT
= VT
.getHalfNumVectorElementsVT(*DAG
.getContext());
17690 if (N
->getNumOperands() == 4)
17691 StoreVT
= StoreVT
.getHalfNumVectorElementsVT(*DAG
.getContext());
17693 SmallVector
<SDValue
> Chains
;
17694 for (int I
= 0; I
< NumIns
; I
++) {
17695 SDValue Ptr
= DAG
.getNode(
17696 ISD::ADD
, DL
, StackPtr
.getValueType(), StackPtr
,
17697 DAG
.getConstant(I
* 16 / NumIns
, DL
, StackPtr
.getValueType()));
17698 MachinePointerInfo MPI
= MachinePointerInfo::getFixedStack(
17699 DAG
.getMachineFunction(), SPFI
, I
* 16 / NumIns
);
17700 SDValue Ch
= DAG
.getTruncStore(DAG
.getEntryNode(), DL
, N
->getOperand(I
),
17701 Ptr
, MPI
, StoreVT
, Align(4));
17702 Chains
.push_back(Ch
);
17705 SDValue Chain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Chains
);
17706 MachinePointerInfo MPI
=
17707 MachinePointerInfo::getFixedStack(DAG
.getMachineFunction(), SPFI
, 0);
17708 return DAG
.getLoad(VT
, DL
, Chain
, StackPtr
, MPI
, Align(4));
17711 // Take a MVEEXT(load x) and split that into (extload x, extload x+8)
17712 static SDValue
PerformSplittingMVEEXTToWideningLoad(SDNode
*N
,
17713 SelectionDAG
&DAG
) {
17714 SDValue N0
= N
->getOperand(0);
17715 LoadSDNode
*LD
= dyn_cast
<LoadSDNode
>(N0
.getNode());
17716 if (!LD
|| !LD
->isSimple() || !N0
.hasOneUse() || LD
->isIndexed())
17719 EVT FromVT
= LD
->getMemoryVT();
17720 EVT ToVT
= N
->getValueType(0);
17721 if (!ToVT
.isVector())
17723 assert(FromVT
.getVectorNumElements() == ToVT
.getVectorNumElements() * 2);
17724 EVT ToEltVT
= ToVT
.getVectorElementType();
17725 EVT FromEltVT
= FromVT
.getVectorElementType();
17727 unsigned NumElements
= 0;
17728 if (ToEltVT
== MVT::i32
&& (FromEltVT
== MVT::i16
|| FromEltVT
== MVT::i8
))
17730 if (ToEltVT
== MVT::i16
&& FromEltVT
== MVT::i8
)
17732 assert(NumElements
!= 0);
17734 ISD::LoadExtType NewExtType
=
17735 N
->getOpcode() == ARMISD::MVESEXT
? ISD::SEXTLOAD
: ISD::ZEXTLOAD
;
17736 if (LD
->getExtensionType() != ISD::NON_EXTLOAD
&&
17737 LD
->getExtensionType() != ISD::EXTLOAD
&&
17738 LD
->getExtensionType() != NewExtType
)
17741 LLVMContext
&C
= *DAG
.getContext();
17743 // Details about the old load
17744 SDValue Ch
= LD
->getChain();
17745 SDValue BasePtr
= LD
->getBasePtr();
17746 Align Alignment
= LD
->getOriginalAlign();
17747 MachineMemOperand::Flags MMOFlags
= LD
->getMemOperand()->getFlags();
17748 AAMDNodes AAInfo
= LD
->getAAInfo();
17750 SDValue Offset
= DAG
.getUNDEF(BasePtr
.getValueType());
17751 EVT NewFromVT
= EVT::getVectorVT(
17752 C
, EVT::getIntegerVT(C
, FromEltVT
.getScalarSizeInBits()), NumElements
);
17753 EVT NewToVT
= EVT::getVectorVT(
17754 C
, EVT::getIntegerVT(C
, ToEltVT
.getScalarSizeInBits()), NumElements
);
17756 SmallVector
<SDValue
, 4> Loads
;
17757 SmallVector
<SDValue
, 4> Chains
;
17758 for (unsigned i
= 0; i
< FromVT
.getVectorNumElements() / NumElements
; i
++) {
17759 unsigned NewOffset
= (i
* NewFromVT
.getSizeInBits()) / 8;
17761 DAG
.getObjectPtrOffset(DL
, BasePtr
, TypeSize::Fixed(NewOffset
));
17764 DAG
.getLoad(ISD::UNINDEXED
, NewExtType
, NewToVT
, DL
, Ch
, NewPtr
, Offset
,
17765 LD
->getPointerInfo().getWithOffset(NewOffset
), NewFromVT
,
17766 Alignment
, MMOFlags
, AAInfo
);
17767 Loads
.push_back(NewLoad
);
17768 Chains
.push_back(SDValue(NewLoad
.getNode(), 1));
17771 SDValue NewChain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Chains
);
17772 DAG
.ReplaceAllUsesOfValueWith(SDValue(LD
, 1), NewChain
);
17773 return DAG
.getMergeValues(Loads
, DL
);
17776 // Perform combines for MVEEXT. If it has not be optimized to anything better
17777 // before lowering, it gets converted to stack store and extloads performing the
17778 // extend whilst still keeping the same lane ordering.
17779 SDValue
ARMTargetLowering::PerformMVEExtCombine(
17780 SDNode
*N
, TargetLowering::DAGCombinerInfo
&DCI
) const {
17781 SelectionDAG
&DAG
= DCI
.DAG
;
17782 EVT VT
= N
->getValueType(0);
17784 assert(N
->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
17785 assert((VT
== MVT::v4i32
|| VT
== MVT::v8i16
) && "Unexpected MVEEXT type");
17787 EVT ExtVT
= N
->getOperand(0).getValueType().getHalfNumVectorElementsVT(
17788 *DAG
.getContext());
17789 auto Extend
= [&](SDValue V
) {
17790 SDValue VVT
= DAG
.getNode(ARMISD::VECTOR_REG_CAST
, DL
, VT
, V
);
17791 return N
->getOpcode() == ARMISD::MVESEXT
17792 ? DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, VT
, VVT
,
17793 DAG
.getValueType(ExtVT
))
17794 : DAG
.getZeroExtendInReg(VVT
, DL
, ExtVT
);
17797 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
17798 if (N
->getOperand(0).getOpcode() == ARMISD::VDUP
) {
17799 SDValue Ext
= Extend(N
->getOperand(0));
17800 return DAG
.getMergeValues({Ext
, Ext
}, DL
);
17803 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
17804 if (auto *SVN
= dyn_cast
<ShuffleVectorSDNode
>(N
->getOperand(0))) {
17805 ArrayRef
<int> Mask
= SVN
->getMask();
17806 assert(Mask
.size() == 2 * VT
.getVectorNumElements());
17807 assert(Mask
.size() == SVN
->getValueType(0).getVectorNumElements());
17808 unsigned Rev
= VT
== MVT::v4i32
? ARMISD::VREV32
: ARMISD::VREV16
;
17809 SDValue Op0
= SVN
->getOperand(0);
17810 SDValue Op1
= SVN
->getOperand(1);
17812 auto CheckInregMask
= [&](int Start
, int Offset
) {
17813 for (int Idx
= 0, E
= VT
.getVectorNumElements(); Idx
< E
; ++Idx
)
17814 if (Mask
[Start
+ Idx
] >= 0 && Mask
[Start
+ Idx
] != Idx
* 2 + Offset
)
17818 SDValue V0
= SDValue(N
, 0);
17819 SDValue V1
= SDValue(N
, 1);
17820 if (CheckInregMask(0, 0))
17822 else if (CheckInregMask(0, 1))
17823 V0
= Extend(DAG
.getNode(Rev
, DL
, SVN
->getValueType(0), Op0
));
17824 else if (CheckInregMask(0, Mask
.size()))
17826 else if (CheckInregMask(0, Mask
.size() + 1))
17827 V0
= Extend(DAG
.getNode(Rev
, DL
, SVN
->getValueType(0), Op1
));
17829 if (CheckInregMask(VT
.getVectorNumElements(), Mask
.size()))
17831 else if (CheckInregMask(VT
.getVectorNumElements(), Mask
.size() + 1))
17832 V1
= Extend(DAG
.getNode(Rev
, DL
, SVN
->getValueType(0), Op1
));
17833 else if (CheckInregMask(VT
.getVectorNumElements(), 0))
17835 else if (CheckInregMask(VT
.getVectorNumElements(), 1))
17836 V1
= Extend(DAG
.getNode(Rev
, DL
, SVN
->getValueType(0), Op0
));
17838 if (V0
.getNode() != N
|| V1
.getNode() != N
)
17839 return DAG
.getMergeValues({V0
, V1
}, DL
);
17842 // MVEEXT(load) -> extload, extload
17843 if (N
->getOperand(0)->getOpcode() == ISD::LOAD
)
17844 if (SDValue L
= PerformSplittingMVEEXTToWideningLoad(N
, DAG
))
17847 if (!DCI
.isAfterLegalizeDAG())
17850 // Lower to a stack store and reload:
17851 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
17852 SDValue StackPtr
= DAG
.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
17853 int SPFI
= cast
<FrameIndexSDNode
>(StackPtr
.getNode())->getIndex();
17854 int NumOuts
= N
->getNumValues();
17855 assert((NumOuts
== 2 || NumOuts
== 4) &&
17856 "Expected 2 or 4 outputs to an MVEEXT");
17857 EVT LoadVT
= N
->getOperand(0).getValueType().getHalfNumVectorElementsVT(
17858 *DAG
.getContext());
17859 if (N
->getNumOperands() == 4)
17860 LoadVT
= LoadVT
.getHalfNumVectorElementsVT(*DAG
.getContext());
17862 MachinePointerInfo MPI
=
17863 MachinePointerInfo::getFixedStack(DAG
.getMachineFunction(), SPFI
, 0);
17864 SDValue Chain
= DAG
.getStore(DAG
.getEntryNode(), DL
, N
->getOperand(0),
17865 StackPtr
, MPI
, Align(4));
17867 SmallVector
<SDValue
> Loads
;
17868 for (int I
= 0; I
< NumOuts
; I
++) {
17869 SDValue Ptr
= DAG
.getNode(
17870 ISD::ADD
, DL
, StackPtr
.getValueType(), StackPtr
,
17871 DAG
.getConstant(I
* 16 / NumOuts
, DL
, StackPtr
.getValueType()));
17872 MachinePointerInfo MPI
= MachinePointerInfo::getFixedStack(
17873 DAG
.getMachineFunction(), SPFI
, I
* 16 / NumOuts
);
17874 SDValue Load
= DAG
.getExtLoad(
17875 N
->getOpcode() == ARMISD::MVESEXT
? ISD::SEXTLOAD
: ISD::ZEXTLOAD
, DL
,
17876 VT
, Chain
, Ptr
, MPI
, LoadVT
, Align(4));
17877 Loads
.push_back(Load
);
17880 return DAG
.getMergeValues(Loads
, DL
);
17883 SDValue
ARMTargetLowering::PerformDAGCombine(SDNode
*N
,
17884 DAGCombinerInfo
&DCI
) const {
17885 switch (N
->getOpcode()) {
17887 case ISD::SELECT_CC
:
17888 case ISD::SELECT
: return PerformSELECTCombine(N
, DCI
, Subtarget
);
17889 case ISD::VSELECT
: return PerformVSELECTCombine(N
, DCI
, Subtarget
);
17890 case ISD::ABS
: return PerformABSCombine(N
, DCI
, Subtarget
);
17891 case ARMISD::ADDE
: return PerformADDECombine(N
, DCI
, Subtarget
);
17892 case ARMISD::UMLAL
: return PerformUMLALCombine(N
, DCI
.DAG
, Subtarget
);
17893 case ISD::ADD
: return PerformADDCombine(N
, DCI
, Subtarget
);
17894 case ISD::SUB
: return PerformSUBCombine(N
, DCI
, Subtarget
);
17895 case ISD::MUL
: return PerformMULCombine(N
, DCI
, Subtarget
);
17896 case ISD::OR
: return PerformORCombine(N
, DCI
, Subtarget
);
17897 case ISD::XOR
: return PerformXORCombine(N
, DCI
, Subtarget
);
17898 case ISD::AND
: return PerformANDCombine(N
, DCI
, Subtarget
);
17900 case ISD::BR_CC
: return PerformHWLoopCombine(N
, DCI
, Subtarget
);
17902 case ARMISD::SUBC
: return PerformAddcSubcCombine(N
, DCI
, Subtarget
);
17903 case ARMISD::SUBE
: return PerformAddeSubeCombine(N
, DCI
, Subtarget
);
17904 case ARMISD::BFI
: return PerformBFICombine(N
, DCI
.DAG
);
17905 case ARMISD::VMOVRRD
: return PerformVMOVRRDCombine(N
, DCI
, Subtarget
);
17906 case ARMISD::VMOVDRR
: return PerformVMOVDRRCombine(N
, DCI
.DAG
);
17907 case ARMISD::VMOVhr
: return PerformVMOVhrCombine(N
, DCI
);
17908 case ARMISD::VMOVrh
: return PerformVMOVrhCombine(N
, DCI
.DAG
);
17909 case ISD::STORE
: return PerformSTORECombine(N
, DCI
, Subtarget
);
17910 case ISD::BUILD_VECTOR
: return PerformBUILD_VECTORCombine(N
, DCI
, Subtarget
);
17911 case ISD::INSERT_VECTOR_ELT
: return PerformInsertEltCombine(N
, DCI
);
17912 case ISD::EXTRACT_VECTOR_ELT
:
17913 return PerformExtractEltCombine(N
, DCI
, Subtarget
);
17914 case ISD::SIGN_EXTEND_INREG
: return PerformSignExtendInregCombine(N
, DCI
.DAG
);
17915 case ISD::INSERT_SUBVECTOR
: return PerformInsertSubvectorCombine(N
, DCI
);
17916 case ISD::VECTOR_SHUFFLE
: return PerformVECTOR_SHUFFLECombine(N
, DCI
.DAG
);
17917 case ARMISD::VDUPLANE
: return PerformVDUPLANECombine(N
, DCI
, Subtarget
);
17918 case ARMISD::VDUP
: return PerformVDUPCombine(N
, DCI
.DAG
, Subtarget
);
17919 case ISD::FP_TO_SINT
:
17920 case ISD::FP_TO_UINT
:
17921 return PerformVCVTCombine(N
, DCI
.DAG
, Subtarget
);
17923 return PerformVDIVCombine(N
, DCI
.DAG
, Subtarget
);
17924 case ISD::INTRINSIC_WO_CHAIN
:
17925 return PerformIntrinsicCombine(N
, DCI
);
17929 return PerformShiftCombine(N
, DCI
, Subtarget
);
17930 case ISD::SIGN_EXTEND
:
17931 case ISD::ZERO_EXTEND
:
17932 case ISD::ANY_EXTEND
:
17933 return PerformExtendCombine(N
, DCI
.DAG
, Subtarget
);
17934 case ISD::FP_EXTEND
:
17935 return PerformFPExtendCombine(N
, DCI
.DAG
, Subtarget
);
17940 return PerformMinMaxCombine(N
, DCI
.DAG
, Subtarget
);
17941 case ARMISD::CMOV
: return PerformCMOVCombine(N
, DCI
.DAG
);
17942 case ARMISD::BRCOND
: return PerformBRCONDCombine(N
, DCI
.DAG
);
17943 case ISD::LOAD
: return PerformLOADCombine(N
, DCI
);
17944 case ARMISD::VLD1DUP
:
17945 case ARMISD::VLD2DUP
:
17946 case ARMISD::VLD3DUP
:
17947 case ARMISD::VLD4DUP
:
17948 return PerformVLDCombine(N
, DCI
);
17949 case ARMISD::BUILD_VECTOR
:
17950 return PerformARMBUILD_VECTORCombine(N
, DCI
);
17952 return PerformBITCASTCombine(N
, DCI
, Subtarget
);
17953 case ARMISD::PREDICATE_CAST
:
17954 return PerformPREDICATE_CASTCombine(N
, DCI
);
17955 case ARMISD::VECTOR_REG_CAST
:
17956 return PerformVECTOR_REG_CASTCombine(N
, DCI
.DAG
, Subtarget
);
17957 case ARMISD::MVETRUNC
:
17958 return PerformMVETruncCombine(N
, DCI
);
17959 case ARMISD::MVESEXT
:
17960 case ARMISD::MVEZEXT
:
17961 return PerformMVEExtCombine(N
, DCI
);
17963 return PerformVCMPCombine(N
, DCI
.DAG
, Subtarget
);
17964 case ISD::VECREDUCE_ADD
:
17965 return PerformVECREDUCE_ADDCombine(N
, DCI
.DAG
, Subtarget
);
17966 case ARMISD::VMOVN
:
17967 return PerformVMOVNCombine(N
, DCI
);
17968 case ARMISD::VQMOVNs
:
17969 case ARMISD::VQMOVNu
:
17970 return PerformVQMOVNCombine(N
, DCI
);
17974 return PerformLongShiftCombine(N
, DCI
.DAG
);
17975 case ARMISD::SMULWB
: {
17976 unsigned BitWidth
= N
->getValueType(0).getSizeInBits();
17977 APInt DemandedMask
= APInt::getLowBitsSet(BitWidth
, 16);
17978 if (SimplifyDemandedBits(N
->getOperand(1), DemandedMask
, DCI
))
17982 case ARMISD::SMULWT
: {
17983 unsigned BitWidth
= N
->getValueType(0).getSizeInBits();
17984 APInt DemandedMask
= APInt::getHighBitsSet(BitWidth
, 16);
17985 if (SimplifyDemandedBits(N
->getOperand(1), DemandedMask
, DCI
))
17989 case ARMISD::SMLALBB
:
17990 case ARMISD::QADD16b
:
17991 case ARMISD::QSUB16b
:
17992 case ARMISD::UQADD16b
:
17993 case ARMISD::UQSUB16b
: {
17994 unsigned BitWidth
= N
->getValueType(0).getSizeInBits();
17995 APInt DemandedMask
= APInt::getLowBitsSet(BitWidth
, 16);
17996 if ((SimplifyDemandedBits(N
->getOperand(0), DemandedMask
, DCI
)) ||
17997 (SimplifyDemandedBits(N
->getOperand(1), DemandedMask
, DCI
)))
18001 case ARMISD::SMLALBT
: {
18002 unsigned LowWidth
= N
->getOperand(0).getValueType().getSizeInBits();
18003 APInt LowMask
= APInt::getLowBitsSet(LowWidth
, 16);
18004 unsigned HighWidth
= N
->getOperand(1).getValueType().getSizeInBits();
18005 APInt HighMask
= APInt::getHighBitsSet(HighWidth
, 16);
18006 if ((SimplifyDemandedBits(N
->getOperand(0), LowMask
, DCI
)) ||
18007 (SimplifyDemandedBits(N
->getOperand(1), HighMask
, DCI
)))
18011 case ARMISD::SMLALTB
: {
18012 unsigned HighWidth
= N
->getOperand(0).getValueType().getSizeInBits();
18013 APInt HighMask
= APInt::getHighBitsSet(HighWidth
, 16);
18014 unsigned LowWidth
= N
->getOperand(1).getValueType().getSizeInBits();
18015 APInt LowMask
= APInt::getLowBitsSet(LowWidth
, 16);
18016 if ((SimplifyDemandedBits(N
->getOperand(0), HighMask
, DCI
)) ||
18017 (SimplifyDemandedBits(N
->getOperand(1), LowMask
, DCI
)))
18021 case ARMISD::SMLALTT
: {
18022 unsigned BitWidth
= N
->getValueType(0).getSizeInBits();
18023 APInt DemandedMask
= APInt::getHighBitsSet(BitWidth
, 16);
18024 if ((SimplifyDemandedBits(N
->getOperand(0), DemandedMask
, DCI
)) ||
18025 (SimplifyDemandedBits(N
->getOperand(1), DemandedMask
, DCI
)))
18029 case ARMISD::QADD8b
:
18030 case ARMISD::QSUB8b
:
18031 case ARMISD::UQADD8b
:
18032 case ARMISD::UQSUB8b
: {
18033 unsigned BitWidth
= N
->getValueType(0).getSizeInBits();
18034 APInt DemandedMask
= APInt::getLowBitsSet(BitWidth
, 8);
18035 if ((SimplifyDemandedBits(N
->getOperand(0), DemandedMask
, DCI
)) ||
18036 (SimplifyDemandedBits(N
->getOperand(1), DemandedMask
, DCI
)))
18040 case ISD::INTRINSIC_VOID
:
18041 case ISD::INTRINSIC_W_CHAIN
:
18042 switch (cast
<ConstantSDNode
>(N
->getOperand(1))->getZExtValue()) {
18043 case Intrinsic::arm_neon_vld1
:
18044 case Intrinsic::arm_neon_vld1x2
:
18045 case Intrinsic::arm_neon_vld1x3
:
18046 case Intrinsic::arm_neon_vld1x4
:
18047 case Intrinsic::arm_neon_vld2
:
18048 case Intrinsic::arm_neon_vld3
:
18049 case Intrinsic::arm_neon_vld4
:
18050 case Intrinsic::arm_neon_vld2lane
:
18051 case Intrinsic::arm_neon_vld3lane
:
18052 case Intrinsic::arm_neon_vld4lane
:
18053 case Intrinsic::arm_neon_vld2dup
:
18054 case Intrinsic::arm_neon_vld3dup
:
18055 case Intrinsic::arm_neon_vld4dup
:
18056 case Intrinsic::arm_neon_vst1
:
18057 case Intrinsic::arm_neon_vst1x2
:
18058 case Intrinsic::arm_neon_vst1x3
:
18059 case Intrinsic::arm_neon_vst1x4
:
18060 case Intrinsic::arm_neon_vst2
:
18061 case Intrinsic::arm_neon_vst3
:
18062 case Intrinsic::arm_neon_vst4
:
18063 case Intrinsic::arm_neon_vst2lane
:
18064 case Intrinsic::arm_neon_vst3lane
:
18065 case Intrinsic::arm_neon_vst4lane
:
18066 return PerformVLDCombine(N
, DCI
);
18067 case Intrinsic::arm_mve_vld2q
:
18068 case Intrinsic::arm_mve_vld4q
:
18069 case Intrinsic::arm_mve_vst2q
:
18070 case Intrinsic::arm_mve_vst4q
:
18071 return PerformMVEVLDCombine(N
, DCI
);
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       Align Alignment,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  auto Ty = VT.getSimpleVT().SimpleTy;

  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
  }

  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // These are for predicates
  if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // These are for truncated stores/narrowing loads. They are fine so long as
  // the alignment is at least the size of the item being loaded
  if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
      Alignment >= VT.getScalarSizeInBits() / 8) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  // VSTRW.U32 all store the vector register in exactly the same format, and
  // differ only in the range of their immediate offset field and the required
  // alignment. So there is always a store that can be used, regardless of
  // actual type.
  //
  // For big endian, that is not the case. But can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = true;
    return true;
  }

  return false;
}
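// Summary of the MVE cases above: predicate vectors and the full 128-bit
// vector types report unaligned accesses as both allowed and fast, while the
// narrowing/truncating vector types only do so when the alignment covers the
// element size.
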
EVT ARMTargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  // See if we can use NEON instructions for this...
  if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
    bool Fast;
    if (Op.size() >= 16 &&
        (Op.isAligned(Align(16)) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
                                         MachineMemOperand::MONone, &Fast) &&
          Fast))) {
      return MVT::v2f64;
    } else if (Op.size() >= 8 &&
               (Op.isAligned(Align(8)) ||
                (allowsMisalignedMemoryAccesses(
                     MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}
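// For example (illustrative sizes): a 32-byte memcpy on a NEON target that is
// not marked noimplicitfloat returns MVT::v2f64 here when the operands are
// 16-byte aligned, or when the query above reports unaligned v2f64 accesses
// as fast.
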
// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;
  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
      !DstVT.isInteger())
    return false;
  unsigned SrcBits = SrcVT.getSizeInBits();
  unsigned DestBits = DstVT.getSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}

bool ARMTargetLowering::isFNegFree(EVT VT) const {
  if (!VT.isSimple())
    return false;

  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions. For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we are avoiding that with this.
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4f16:
  case MVT::v8f16:
    return Subtarget->hasFullFP16();
  default:
    break;
  }

  return false;
}
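// The isZExtFree case above relies on LDRB/LDRH zero-extending into a full
// 32-bit register as part of the load itself, so no separate uxtb/uxth is
// needed after an i8 or i16 load.
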
18240 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
18241 /// of the vector elements.
18242 static bool areExtractExts(Value
*Ext1
, Value
*Ext2
) {
18243 auto areExtDoubled
= [](Instruction
*Ext
) {
18244 return Ext
->getType()->getScalarSizeInBits() ==
18245 2 * Ext
->getOperand(0)->getType()->getScalarSizeInBits();
18248 if (!match(Ext1
, m_ZExtOrSExt(m_Value())) ||
18249 !match(Ext2
, m_ZExtOrSExt(m_Value())) ||
18250 !areExtDoubled(cast
<Instruction
>(Ext1
)) ||
18251 !areExtDoubled(cast
<Instruction
>(Ext2
)))
18257 /// Check if sinking \p I's operands to I's basic block is profitable, because
18258 /// the operands can be folded into a target instruction, e.g.
18259 /// sext/zext can be folded into vsubl.
18260 bool ARMTargetLowering::shouldSinkOperands(Instruction
*I
,
18261 SmallVectorImpl
<Use
*> &Ops
) const {
18262 if (!I
->getType()->isVectorTy())
18265 if (Subtarget
->hasNEON()) {
18266 switch (I
->getOpcode()) {
18267 case Instruction::Sub
:
18268 case Instruction::Add
: {
18269 if (!areExtractExts(I
->getOperand(0), I
->getOperand(1)))
18271 Ops
.push_back(&I
->getOperandUse(0));
18272 Ops
.push_back(&I
->getOperandUse(1));
18280 if (!Subtarget
->hasMVEIntegerOps())
18283 auto IsFMSMul
= [&](Instruction
*I
) {
18284 if (!I
->hasOneUse())
18286 auto *Sub
= cast
<Instruction
>(*I
->users().begin());
18287 return Sub
->getOpcode() == Instruction::FSub
&& Sub
->getOperand(1) == I
;
18289 auto IsFMS
= [&](Instruction
*I
) {
18290 if (match(I
->getOperand(0), m_FNeg(m_Value())) ||
18291 match(I
->getOperand(1), m_FNeg(m_Value())))
18296 auto IsSinker
= [&](Instruction
*I
, int Operand
) {
18297 switch (I
->getOpcode()) {
18298 case Instruction::Add
:
18299 case Instruction::Mul
:
18300 case Instruction::FAdd
:
18301 case Instruction::ICmp
:
18302 case Instruction::FCmp
:
18304 case Instruction::FMul
:
18305 return !IsFMSMul(I
);
18306 case Instruction::Sub
:
18307 case Instruction::FSub
:
18308 case Instruction::Shl
:
18309 case Instruction::LShr
:
18310 case Instruction::AShr
:
18311 return Operand
== 1;
18312 case Instruction::Call
:
18313 if (auto *II
= dyn_cast
<IntrinsicInst
>(I
)) {
18314 switch (II
->getIntrinsicID()) {
18315 case Intrinsic::fma
:
18317 case Intrinsic::arm_mve_add_predicated
:
18318 case Intrinsic::arm_mve_mul_predicated
:
18319 case Intrinsic::arm_mve_qadd_predicated
:
18320 case Intrinsic::arm_mve_hadd_predicated
:
18321 case Intrinsic::arm_mve_vqdmull_predicated
:
18322 case Intrinsic::arm_mve_qdmulh_predicated
:
18323 case Intrinsic::arm_mve_qrdmulh_predicated
:
18324 case Intrinsic::arm_mve_fma_predicated
:
18326 case Intrinsic::arm_mve_sub_predicated
:
18327 case Intrinsic::arm_mve_qsub_predicated
:
18328 case Intrinsic::arm_mve_hsub_predicated
:
18329 return Operand
== 1;
18340 for (auto OpIdx
: enumerate(I
->operands())) {
18341 Instruction
*Op
= dyn_cast
<Instruction
>(OpIdx
.value().get());
18342 // Make sure we are not already sinking this operand
18343 if (!Op
|| any_of(Ops
, [&](Use
*U
) { return U
->get() == Op
; }))
18346 Instruction
*Shuffle
= Op
;
18347 if (Shuffle
->getOpcode() == Instruction::BitCast
)
18348 Shuffle
= dyn_cast
<Instruction
>(Shuffle
->getOperand(0));
18349 // We are looking for a splat that can be sunk.
18351 !match(Shuffle
, m_Shuffle(
18352 m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
18353 m_Undef(), m_ZeroMask())))
18355 if (!IsSinker(I
, OpIdx
.index()))
18358 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
18359 // and vector registers
18360 for (Use
&U
: Op
->uses()) {
18361 Instruction
*Insn
= cast
<Instruction
>(U
.getUser());
18362 if (!IsSinker(Insn
, U
.getOperandNo()))
18366 Ops
.push_back(&Shuffle
->getOperandUse(0));
18368 Ops
.push_back(&Op
->getOperandUse(0));
18369 Ops
.push_back(&OpIdx
.value());
18374 Type
*ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst
*SVI
) const {
18375 if (!Subtarget
->hasMVEIntegerOps())
18377 Type
*SVIType
= SVI
->getType();
18378 Type
*ScalarType
= SVIType
->getScalarType();
18380 if (ScalarType
->isFloatTy())
18381 return Type::getInt32Ty(SVIType
->getContext());
18382 if (ScalarType
->isHalfTy())
18383 return Type::getInt16Ty(SVIType
->getContext());
18387 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal
) const {
18388 EVT VT
= ExtVal
.getValueType();
18390 if (!isTypeLegal(VT
))
18393 if (auto *Ld
= dyn_cast
<MaskedLoadSDNode
>(ExtVal
.getOperand(0))) {
18394 if (Ld
->isExpandingLoad())
18398 if (Subtarget
->hasMVEIntegerOps())
18401 // Don't create a loadext if we can fold the extension into a wide/long
18403 // If there's more than one user instruction, the loadext is desirable no
18404 // matter what. There can be two uses by the same instruction.
18405 if (ExtVal
->use_empty() ||
18406 !ExtVal
->use_begin()->isOnlyUserOf(ExtVal
.getNode()))
18409 SDNode
*U
= *ExtVal
->use_begin();
18410 if ((U
->getOpcode() == ISD::ADD
|| U
->getOpcode() == ISD::SUB
||
18411 U
->getOpcode() == ISD::SHL
|| U
->getOpcode() == ARMISD::VSHLIMM
))
18417 bool ARMTargetLowering::allowTruncateForTailCall(Type
*Ty1
, Type
*Ty2
) const {
18418 if (!Ty1
->isIntegerTy() || !Ty2
->isIntegerTy())
18421 if (!isTypeLegal(EVT::getEVT(Ty1
)))
18424 assert(Ty1
->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
18426 // Assuming the caller doesn't have a zeroext or signext return parameter,
18427 // truncation all the way down to i1 is valid.
InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                        const AddrMode &AM,
                                                        Type *Ty,
                                                        unsigned AS) const {
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
    return 0;
  }
  return -1;
}
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
///
/// ARM supports both fused and unfused multiply-add operations; we already
/// lower a pair of fmul and fadd to the latter so it's not clear that there
/// would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
///
/// For MVE, we set this to true as it helps simplify the need for some
/// patterns (and we don't have the non-fused floating point instruction).
bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4f32:
  case MVT::v8f16:
    return Subtarget->hasMVEFloatOps();
  case MVT::f16:
    return Subtarget->useFPVFMx16();
  case MVT::f32:
    return Subtarget->useFPVFMx();
  case MVT::f64:
    return Subtarget->useFPVFMx64();
  default:
    break;
  }

  return false;
}
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
    // Scale == 1;
    break;
  case MVT::i16:
    // Scale == 2;
    Scale = 2;
    break;
  default:
    // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
    // which has a scale of 4.
    Scale = 4;
    break;
  }

  if ((V & (Scale - 1)) != 0)
    return false;
  return isUInt<5>(V / Scale);
}
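// Worked example for the Thumb1 rule above (illustrative only): for an i32
// access Scale is 4, so the legal offsets are the multiples of 4 from 0 to
// 124 (a 5-bit field scaled by the access size). 120 is accepted, while 122
// (misaligned) and 128 (out of range) are rejected.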
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  if (!VT.isInteger() && !VT.isFloatingPoint())
    return false;
  if (VT.isVector() && Subtarget->hasNEON())
    return false;
  if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
      !Subtarget->hasMVEFloatOps())
    return false;

  bool IsNeg = false;
  if (V < 0) {
    IsNeg = true;
    V = -V;
  }

  unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);

  // MVE: size * imm7
  if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
    switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      return isShiftedUInt<7,2>(V);
    case MVT::i16:
    case MVT::f16:
      return isShiftedUInt<7,1>(V);
    default:
      return isUInt<7>(V);
    }
  }

  // half VLDR: 2 * imm8
  if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
    return isShiftedUInt<8, 1>(V);
  // VLDR and LDRD: 4 * imm8
  if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
    return isShiftedUInt<8, 2>(V);

  if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
    // + imm12 or - imm8
    if (IsNeg)
      return isUInt<8>(V);
    return isUInt<12>(V);
  }

  return false;
}
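// Rough illustration of the MVE case above: for a v4i32 access the offset
// must satisfy isShiftedUInt<7,2>, i.e. its magnitude is a multiple of 4 no
// larger than 508 (a 7-bit field scaled by the element size).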
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  else if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode.
  if (V < 0)
    V = -V;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return isUInt<12>(V);
  case MVT::i16:
    // +- imm8
    return isUInt<8>(V);
  case MVT::f32:
  case MVT::f64:
    if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
      return false;
    return isShiftedUInt<8, 2>(V);
  }
}
bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    if (Scale == 1)
      return true;
    // r + r << imm
    Scale = Scale & ~1;
    return Scale == 2 || Scale == 4 || Scale == 8;
  case MVT::i64:
    // FIXME: What are we trying to model here? ldrd doesn't have an r + r
    // version in Thumb mode.
    // r + r
    if (Scale == 1)
      return true;
    // r * 2 (this can be lowered to r + r).
    if (!AM.HasBaseReg && Scale == 2)
      return true;
    return false;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations. This should be made more precise and revisited later.

    // Allow r << imm, but the imm has to be a multiple of two.
    if (Scale & 1) return false;
    return isPowerOf2_32(Scale);
  }
}
bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Scale = AM.Scale;

  // Negative scales are not supported in Thumb1.
  if (Scale < 0)
    return false;

  // Thumb1 addressing modes do not support register scaling excepting the
  // following cases:
  // 1. Scale == 1 means no scaling.
  // 2. Scale == 2 this can be lowered to r + r if there is no base register.
  return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  EVT VT = getValueType(DL, Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    if (Subtarget->isThumb1Only())
      return isLegalT1ScaledAddressingMode(AM, VT);

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r +/- r
      if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
        return true;
      // r * 2 (this can be lowered to r + r).
      if (!AM.HasBaseReg && Scale == 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations. This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  return true;
}
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates.
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
  // Thumb1 doesn't have cmn, and only 8-bit immediates.
  return Imm >= 0 && Imm <= 255;
}
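// For instance, in ARM/Thumb2 mode Imm == -1 is legal even though 0xFFFFFFFF
// is not an encodable modified immediate: the negated value 1 is encodable,
// so the compare can be emitted as cmn. Thumb1 only accepts 0..255.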
/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is the target has add or sub instructions which can
/// add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Same encoding for add/sub, just flip the sign.
  int64_t AbsImm = std::abs(Imm);
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal(AbsImm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediate.
  return AbsImm >= 0 && AbsImm <= 255;
}
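// As a rough example of the ARM-mode rule above: |Imm| == 0xFF00 is a valid
// modified immediate (an 8-bit value rotated by an even amount), whereas
// |Imm| == 0x101 is not and would have to be materialized in a register.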
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      ARM_AM::ShiftOpc ShOpcVal =
          ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
                                     bool isSEXTLoad, SDValue &Base,
                                     SDValue &Offset, bool &isInc,
                                     SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  Base = Ptr->getOperand(0);
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
    int RHSC = (int)RHS->getZExtValue();
    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
  }

  return false;
}
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
                                      bool isSEXTLoad, bool IsMasked, bool isLE,
                                      SDValue &Base, SDValue &Offset,
                                      bool &isInc, SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;
  if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
    return false;

  // We allow LE non-masked loads to change the type (for example use a vldrb.8
  // as opposed to a vldrw.32). This can allow extra addressing modes or
  // alignments for what is otherwise an equivalent instruction.
  bool CanChangeType = isLE && !IsMasked;

  ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
  int RHSC = (int)RHS->getZExtValue();

  auto IsInRange = [&](int RHSC, int Limit, int Scale) {
    if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
    return false;
  };

  // Try to find a matching instruction based on s/zext, Alignment, Offset and
  // (in BE/masked) type.
  Base = Ptr->getOperand(0);
  if (VT == MVT::v4i16) {
    if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
      return true;
  } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
    if (IsInRange(RHSC, 0x80, 1))
      return true;
  } else if (Alignment >= 4 &&
             (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
             IsInRange(RHSC, 0x80, 4))
    return true;
  else if (Alignment >= 2 &&
           (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
           IsInRange(RHSC, 0x80, 2))
    return true;
  else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
    return true;
  return false;
}
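// Illustration of the ranges above: for a v4i32 access with alignment >= 4,
// IsInRange(RHSC, 0x80, 4) accepts non-zero offsets that are multiples of 4
// with a magnitude below 512, matching the scaled 7-bit MVE offset field.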
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
bool
ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                             SDValue &Offset,
                                             ISD::MemIndexedMode &AM,
                                             SelectionDAG &DAG) const {
  if (Subtarget->isThumb1Only())
    return false;

  EVT VT;
  SDValue Ptr;
  Align Alignment;
  bool isSEXTLoad = false;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Alignment = LD->getAlign();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Alignment = ST->getAlign();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Alignment = LD->getAlign();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Alignment = ST->getAlign();
    IsMasked = true;
  } else
    return false;

  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(
                  Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
                  Subtarget->isLittle(), Base, Offset, isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                         Offset, isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                          Offset, isInc, DAG);
  }
  if (!isLegal)
    return false;

  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  Align Alignment;
  bool isSEXTLoad = false, isNonExt;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Alignment = LD->getAlign();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Alignment = ST->getAlign();
    isNonExt = !ST->isTruncatingStore();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Alignment = LD->getAlign();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Alignment = ST->getAlign();
    isNonExt = !ST->isTruncatingStore();
    IsMasked = true;
  } else
    return false;

  if (Subtarget->isThumb1Only()) {
    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
    // must be non-extending/truncating, i32, with an offset of 4.
    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
    if (Op->getOpcode() != ISD::ADD || !isNonExt)
      return false;
    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!RHS || RHS->getZExtValue() != 4)
      return false;
    if (Alignment < Align(4))
      return false;

    Offset = Op->getOperand(1);
    Base = Op->getOperand(0);
    AM = ISD::POST_INC;
    return true;
  }

  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
                                        Subtarget->isLittle(), Base, Offset,
                                        isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                         isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                          isInc, DAG);
  }
  if (!isLegal)
    return false;

  // Swap base ptr and offset to catch more post-index load / store when
  // it's legal. In Thumb2 mode, offset must be an immediate.
  if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
      !Subtarget->isThumb2())
    std::swap(Base, Offset);

  // Post-indexed load / store update the base pointer.
  if (Ptr != Base)
    return false;

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
    if (Known.isUnknown())
      return;

    KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    Known = KnownBits::commonBits(Known, KnownRHS);
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  case ARMISD::VGETLANEs:
  case ARMISD::VGETLANEu: {
    const SDValue &SrcSV = Op.getOperand(0);
    EVT VecVT = SrcSV.getValueType();
    assert(VecVT.isVector() && "VGETLANE expected a vector type");
    const unsigned NumSrcElts = VecVT.getVectorNumElements();
    ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
    assert(Pos->getAPIntValue().ult(NumSrcElts) &&
           "VGETLANE index out of bounds");
    unsigned Idx = Pos->getZExtValue();
    APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
    Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

    EVT VT = Op.getValueType();
    const unsigned DstSz = VT.getScalarSizeInBits();
    const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();

    assert(SrcSz == Known.getBitWidth());
    assert(DstSz > SrcSz);
    if (Op.getOpcode() == ARMISD::VGETLANEs)
      Known = Known.sext(DstSz);
    else {
      Known = Known.zext(DstSz);
    }
    assert(DstSz == Known.getBitWidth());
    break;
  }
  case ARMISD::VMOVrh: {
    KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    assert(KnownOp.getBitWidth() == 16);
    Known = KnownOp.zext(32);
    break;
  }
  case ARMISD::CSINC:
  case ARMISD::CSINV:
  case ARMISD::CSNEG: {
    KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);

    // The result is either:
    // CSINC: KnownOp0 or KnownOp1 + 1
    // CSINV: KnownOp0 or ~KnownOp1
    // CSNEG: KnownOp0 or KnownOp1 * -1
    if (Op.getOpcode() == ARMISD::CSINC)
      KnownOp1 = KnownBits::computeForAddSub(
          true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
    else if (Op.getOpcode() == ARMISD::CSINV)
      std::swap(KnownOp1.Zero, KnownOp1.One);
    else if (Op.getOpcode() == ARMISD::CSNEG)
      KnownOp1 = KnownBits::mul(
          KnownOp1, KnownBits::makeConstant(APInt(32, -1)));

    Known = KnownBits::commonBits(KnownOp0, KnownOp1);
    break;
  }
  }
}
bool ARMTargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  unsigned Mask = C->getZExtValue();

  unsigned Demanded = DemandedBits.getZExtValue();
  unsigned ShrunkMask = Mask & Demanded;
  unsigned ExpandedMask = Mask | ~Demanded;

  // If the mask is all zeros, let the target-independent code replace the
  // result with zero.
  if (ShrunkMask == 0)
    return false;

  // If the mask is all ones, erase the AND. (Currently, the target-independent
  // code won't do this, so we have to do it explicitly to avoid an infinite
  // loop in obscure cases.)
  if (ExpandedMask == ~0U)
    return TLO.CombineTo(Op, Op.getOperand(0));

  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  };
  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
    if (NewMask == Mask)
      return true;
    SDLoc DL(Op);
    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
    return TLO.CombineTo(Op, NewOp);
  };

  // Prefer uxtb mask.
  if (IsLegalMask(0xFF))
    return UseMask(0xFF);

  // Prefer uxth mask.
  if (IsLegalMask(0xFFFF))
    return UseMask(0xFFFF);

  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if (ShrunkMask < 256)
    return UseMask(ShrunkMask);

  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
    return UseMask(ExpandedMask);

  // Potential improvements:
  //
  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  // We could try to prefer Thumb1 immediates which can be lowered to a
  // two-instruction sequence.
  // We could try to recognize more legal ARM/Thumb2 immediates here.

  return false;
}
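// Worked example (illustrative): for (and X, 0x1FF) where only the low 8 bits
// are demanded, ShrunkMask is 0xFF and ExpandedMask is 0xFFFFFFFF, so
// IsLegalMask(0xFF) holds and the constant is shrunk to 0xFF (a uxtb-style
// mask).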
bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {
  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  case ARMISD::ASRL:
  case ARMISD::LSRL: {
    // If this is result 0 and the other result is unused, see if the demand
    // bits allow us to shrink this long shift into a standard small shift in
    // the opposite direction.
    if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
        isa<ConstantSDNode>(Op->getOperand(2))) {
      unsigned ShAmt = Op->getConstantOperandVal(2);
      if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
                            APInt::getAllOnesValue(32) << (32 - ShAmt)))
        return TLO.CombineTo(
            Op, TLO.DAG.getNode(
                    ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
                    TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
    }
    break;
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
  std::string AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = std::string(AsmPieces[0]);
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}
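// For example, an i32 asm statement of the form
//   asm("rev $0, $1" : "=l"(out) : "l"(in))
// is recognised here and rewritten as a byte-swap intrinsic call instead of
// being emitted as inline assembly.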
const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasVFP2Base())
    return "r";
  if (ConstraintVT.isFloatingPoint())
    return "w";
  if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
      (ConstraintVT.getSizeInBits() == 64 ||
       ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
  unsigned S = Constraint.size();
  if (S == 1) {
    switch (Constraint[0]) {
    default:  break;
    case 'l': return C_RegisterClass;
    case 'w': return C_RegisterClass;
    case 'h': return C_RegisterClass;
    case 'x': return C_RegisterClass;
    case 't': return C_RegisterClass;
    case 'j': return C_Immediate; // Constant for movw.
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as an 'r' memory constraint.
    case 'Q': return C_Memory;
    }
  } else if (S == 2) {
    switch (Constraint[0]) {
    default: break;
    case 'T': return C_RegisterClass;
    // All 'U+' constraints are addresses.
    case 'U': return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
ARMTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'l':
    if (type->isIntegerTy()) {
      if (Subtarget->isThumb())
        weight = CW_SpecificReg;
      else
        weight = CW_Register;
    }
    break;
  case 'w':
    if (type->isFloatingPointTy())
      weight = CW_Register;
    break;
  }
  return weight;
}
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;

RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  switch (Constraint.size()) {
  case 1:
    // GCC ARM Constraint Letters
    switch (Constraint[0]) {
    case 'l': // Low regs or general regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'h': // High regs or no regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::hGPRRegClass);
      break;
    case 'r':
      if (Subtarget->isThumb1Only())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'w':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPRRegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPRRegClass);
      break;
    case 'x':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPR_8RegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_8RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_8RegClass);
      break;
    case 't':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32 || VT == MVT::i32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_VFP2RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_VFP2RegClass);
      break;
    }
    break;

  case 2:
    if (Constraint[0] == 'T') {
      switch (Constraint[1]) {
      default:
        break;
      case 'e':
        return RCPair(0U, &ARM::tGPREvenRegClass);
      case 'o':
        return RCPair(0U, &ARM::tGPROddRegClass);
      }
    }
    break;

  default:
    break;
  }

  if (StringRef("{cc}").equals_insensitive(Constraint))
    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'j':
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'O':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    int64_t CVal64 = C->getSExtValue();
    int CVal = (int) CVal64;
    // None of these constraints allow values larger than 32 bits. Check
    // that the value fits in an int.
    if (CVal != CVal64)
      return;

    switch (ConstraintLetter) {
      case 'j':
        // Constant suitable for movw, must be between 0 and
        // 65535.
        if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
      case 'I':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 255, for ADD
          // immediates.
          if (CVal >= 0 && CVal <= 255)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getT2SOImmVal(CVal) != -1)
            break;
        } else {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getSOImmVal(CVal) != -1)
            break;
        }
        return;

      case 'J':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -255 and -1, for negated ADD
          // immediates. This can be used in GCC with an "n" modifier that
          // prints the negated value, for use with SUB instructions. It is
          // not useful otherwise but is implemented for compatibility.
          if (CVal >= -255 && CVal <= -1)
            break;
        } else {
          // This must be a constant between -4095 and 4095. It is not clear
          // what this constraint is intended for. Implemented for
          // compatibility with GCC.
          if (CVal >= -4095 && CVal <= 4095)
            break;
        }
        return;

      case 'K':
        if (Subtarget->isThumb1Only()) {
          // A 32-bit value where only one byte has a nonzero value. Exclude
          // zero to match GCC. This constraint is used by GCC internally for
          // constants that can be loaded with a move/shift combination.
          // It is not useful otherwise but is implemented for compatibility.
          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
            break;
        } else {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getSOImmVal(~CVal) != -1)
            break;
        }
        return;

      case 'L':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -7 and 7,
          // for 3-operand ADD/SUB immediate instructions.
          if (CVal >= -7 && CVal < 7)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
            break;
        } else {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getSOImmVal(-CVal) != -1)
            break;
        }
        return;

      case 'M':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between 0 and 1020, for
          // ADD sp + immediate.
          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
            break;
        } else {
          // A power of two or a constant between 0 and 32. This is used in
          // GCC for the shift amount on shifted register operands, but it is
          // useful in general for any shift amounts.
          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
            break;
        }
        return;

      case 'N':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
        }
        return;

      case 'O':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
            break;
        }
        return;
    }
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
static RTLIB::Libcall getDivRemLibcall(
    const SDNode *N, MVT::SimpleValueType SVT) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemLibcall");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  RTLIB::Libcall LC;
  switch (SVT) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  }
  return LC;
}
static TargetLowering::ArgListTy getDivRemArgList(
    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemArgList");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    EVT ArgVT = N->getOperand(i).getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
    Entry.Node = N->getOperand(i);
    Entry.Ty = ArgTy;
    Entry.IsSExt = isSigned;
    Entry.IsZExt = !isSigned;
    Args.push_back(Entry);
  }
  if (Subtarget->isTargetWindows() && Args.size() >= 2)
    std::swap(Args[0], Args[1]);
  return Args;
}
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
          Subtarget->isTargetWindows()) &&
         "Register-based DivRem lowering only");
  unsigned Opcode = Op->getOpcode();
  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  bool isSigned = (Opcode == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  SDLoc dl(Op);

  // If the target has hardware divide, use divide + multiply + subtract:
  //     div = a / b
  //     rem = a - b * div
  //     return {div, rem}
  // This should be lowered into UDIV/SDIV + MLS later on.
  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (hasDivide && Op->getValueType(0).isSimple() &&
      Op->getSimpleValueType(0) == MVT::i32) {
    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    const SDValue Dividend = Op->getOperand(0);
    const SDValue Divisor = Op->getOperand(1);
    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

    SDValue Values[2] = {Div, Rem};
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  }

  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
                                       VT.getSimpleVT().SimpleTy);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
                                                    DAG.getContext(),
                                                    Subtarget);

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  Type *RetTy = StructType::get(Ty, Ty);

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return CallInfo.first;
}
// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem)
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
  case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                              SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
                                                    Subtarget);
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, N, InChain);

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}
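// Note (informal): on AEABI targets the divmod libcalls resolve to helpers
// such as __aeabi_idivmod / __aeabi_uidivmod, which return the quotient and
// remainder as a pair; that is why LowerREM extracts operand 1 of the call
// result above.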
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);

  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe")) {
    MaybeAlign Align =
        cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
    SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
    if (Align)
      SP =
          DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
                      DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
    Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
    SDValue Ops[2] = { SP, Chain };
    return DAG.getMergeValues(Ops, DL);
  }

  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                              DAG.getConstant(2, DL, MVT::i32));

  SDValue Flag;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);

  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = NewSP.getValue(1);

  SDValue Ops[2] = { NewSP, Chain };
  return DAG.getMergeValues(Ops, DL);
}
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
         "Unexpected type for custom-lowering FP_EXTEND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
         "With FP16, 16 to 32 conversion is legal!");

  // Converting from 32 -> 64 is valid if we have FP64.
  if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
    // FIXME: Remove this when we have strict fp instruction selection patterns
    if (IsStrict) {
      SDLoc Loc(Op);
      SDValue Result = DAG.getNode(ISD::FP_EXTEND,
                                   Loc, Op.getValueType(), SrcVal);
      return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
    }
    return Op;
  }

  // Either we are converting from 16 -> 64, without FP16 and/or
  // FP.double-precision or without Armv8-fp. So we must do it in two
  // steps.
  // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
  // without FP16. So we must do a function call.
  SDLoc Loc(Op);
  RTLIB::Libcall LC;
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
    bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
    MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
    MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
    if (Supported) {
      if (IsStrict) {
        SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
                             {DstVT, MVT::Other}, {Chain, SrcVal});
        Chain = SrcVal.getValue(1);
      } else {
        SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
      }
    } else {
      LC = RTLIB::getFPEXT(SrcVT, DstVT);
      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
             "Unexpected type for custom-lowering FP_EXTEND");
      std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                            Loc, Chain);
    }
  }

  return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
}
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVT.getSizeInBits();

  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
         "Unexpected type for custom-lowering FP_ROUND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  SDLoc Loc(Op);

  // Instruction from 32 -> 16 if hasFP16 is valid
  if (SrcSz == 32 && Subtarget->hasFP16())
    return Op;

  // Lib call from 32 -> 16 / 64 -> [32, 16]
  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_ROUND");
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue Result;
  std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                        Loc, Chain);
  return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
}
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) const {
  assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
  MVT HalfT = MVT::i32;
  SDLoc dl(N);
  SDValue Hi, Lo, Tmp;

  if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
      !isOperationLegalOrCustom(ISD::UADDO, HalfT))
    return;

  unsigned OpTypeBits = HalfT.getScalarSizeInBits();
  SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(0, dl, HalfT));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(1, dl, HalfT));

  Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
                    DAG.getConstant(OpTypeBits - 1, dl,
                                    getShiftAmountTy(HalfT, DAG.getDataLayout())));
  Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
  Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
                   SDValue(Lo.getNode(), 1));
  Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
  Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);

  Results.push_back(Lo);
  Results.push_back(Hi);
}
bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The ARM target isn't yet aware of offsets.
  return false;
}

bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // there can be 1's on either or both "outsides", all the "inside"
  // bits must be 0's
  return isShiftedMask_32(~v);
}
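// Example: v == 0xF00000FF is an inverted mask (~v == 0x0FFFFF00 is a single
// contiguous run of ones), but v == 0xFF00FF00 is not, because ~v contains
// two separate runs.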
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!Subtarget->hasVFP3Base())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
      ARM_AM::getFP32FP16Imm(Imm) != -1)
    return true;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && Subtarget->hasFP64())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}
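// For illustration: values such as 1.0, -2.5 and 0.375 fit the VFP/NEON 8-bit
// immediate encoding and can be selected directly, while 0.1 cannot and ends
// up as a constant-pool load.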
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vld2q:
  case Intrinsic::arm_mve_vld4q: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile loads with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vst2q:
  case Intrinsic::arm_mve_vst4q: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    Type *VecTy = I.getArgOperand(1)->getType();
    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile stores with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_base:
  case Intrinsic::arm_mve_vldr_gather_base_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_base_wb:
  case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_offset:
  case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getType());
    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
                                  DataVT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_base:
  case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_base_wb:
  case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_offset:
  case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
                                  DataVT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  default:
    break;
  }

  return false;
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}
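
// Extracting a subvector is treated as cheap only when the operation is
// legal or custom for the result type and the extract starts at element 0 or
// right after the first ResVT-sized block of the source vector.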
bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;
  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
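
// Emit a data memory barrier for the given domain. Targets without the DMB
// instruction fall back to the ARMv6 CP15 "mcr" encoding of the barrier;
// anything older should never reach this point.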
Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      Value *args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    }
    // Instead of using barriers, atomic accesses on these subtargets use
    // libcalls.
    llvm_unreachable("makeDMB on a target so old that it has no barriers");
  }

  Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
  // Only a full system barrier exists in the M-class architectures.
  Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
  Constant *CDomain = Builder.getInt32(Domain);
  return Builder.CreateCall(DMB, CDomain);
}
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}
// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}
// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}
// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement atomicrmw without spilling. If the target address is also on the
  // stack and close enough to the spill slot, this can lead to a situation
  // where the monitor always gets cleared and the atomic operation can never
  // succeed. So at -O0 lower this operation to a CAS loop.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}
// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
// bits, and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
      Size <= (Subtarget->isMClass() ? 32U : 64U))
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}
bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}
// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}
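
// On MSVC environments the stack protector uses the CRT's __security_cookie
// global and __security_check_cookie function; everywhere else the default
// TargetLowering stack-protector lowering is used.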
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addParamAttr(0, Attribute::AttrKind::InReg);
}
Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}
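
// Decide whether a store of a single extracted vector element can be folded
// into a lane store. Cost is set to 0 when the combine is free, i.e. when the
// vector fits exactly in a D or Q register and the lane index is a constant.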
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode
  // for those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
  // We can do a store + vector extract on any vector that fits perfectly in a
  // D or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}
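
// Emit a load-linked (ldrex/ldaex, or ldrexd/ldaexd for 64-bit values), used
// when expanding atomic operations into LL/SC loops.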
Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
                                         Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValueTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy);
}
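
// When a cmpxchg expansion can exit without executing the store-conditional,
// clear the exclusive monitor with clrex (available from ARMv7) so it is not
// left in the exclusive state.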
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilderBase &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}
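
// Emit a store-conditional (strex/stlex, or strexd/stlexd for 64-bit values);
// the call returns the i32 status produced by the instruction (0 on success).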
Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
                                               Value *Val, Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}
bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}
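
// Check whether an interleaved access of the given factor, vector type and
// alignment can be lowered to vldN/vstN (NEON) or vld2q/vld4q-style (MVE)
// operations, possibly after being split into 128-bit pieces.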
bool ARMTargetLowering::isLegalInterleavedAccessType(
    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
    const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
    return false;

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
    return false;
  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;
  // And that the alignment is high enough under MVE.
  if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}
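
// NEON provides vld2/vld3/vld4 and vst2/vst3/vst4, so factors up to 4 are
// supported there; for MVE the limit is MVEMaxSupportedInterleaveFactor.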
unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}
/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
  Type *EltTy = VecTy->getElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();
  Align Alignment = LI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = FixedVectorType::get(VecTy->getElementType(),
                                 VecTy->getNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy =
          VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
                                            VecTy->getNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec,
            FixedVectorType::get(SV->getType()->getElementType(), VecTy));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    Value *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  Align Alignment = SI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      append_range(Ops, Shuffles);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      append_range(Ops, Shuffles);
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}
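
// Recursively check whether a type is an AAPCS-VFP homogeneous aggregate:
// one to four members, all of the same base type (float, double, or a
// 64-/128-bit vector).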
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}
/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(
    Type *ArgTy, const DataLayout &DL) const {
  const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}
/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}
Register ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}

Register ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}
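
// Split-CSR support (used by conventions such as CXX_FAST_TLS): rather than
// spilling callee-saved registers in the prologue/epilogue, the entry block
// copies them into virtual registers and the exit blocks copy them back.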
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
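
// Called once DAG lowering is complete; compute the maximum call frame size
// so that frame lowering can finalize the stack layout, then defer to the
// base class.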
void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);