//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                           cl::desc("Allow AArch64 SLI/SRI formation"),
                           cl::init(false));

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                         cl::desc("Enable AArch64 logical imm instruction "
                                  "optimization"),
                         cl::init(true));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
  }
  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, Custom);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, Custom);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);
  setOperationAction(ISD::ADDC, MVT::i64, Custom);
  setOperationAction(ISD::ADDE, MVT::i64, Custom);
  setOperationAction(ISD::SUBC, MVT::i64, Custom);
  setOperationAction(ISD::SUBE, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);
  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

  setOperationAction(ISD::FREM, MVT::f16, Promote);
  setOperationAction(ISD::FREM, MVT::v4f16, Promote);
  setOperationAction(ISD::FREM, MVT::v8f16, Promote);
  setOperationAction(ISD::FPOW, MVT::f16, Promote);
  setOperationAction(ISD::FPOW, MVT::v4f16, Promote);
  setOperationAction(ISD::FPOW, MVT::v8f16, Promote);
  setOperationAction(ISD::FPOWI, MVT::f16, Promote);
  setOperationAction(ISD::FCOS, MVT::f16, Promote);
  setOperationAction(ISD::FCOS, MVT::v4f16, Promote);
  setOperationAction(ISD::FCOS, MVT::v8f16, Promote);
  setOperationAction(ISD::FSIN, MVT::f16, Promote);
  setOperationAction(ISD::FSIN, MVT::v4f16, Promote);
  setOperationAction(ISD::FSIN, MVT::v8f16, Promote);
  setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
  setOperationAction(ISD::FSINCOS, MVT::v4f16, Promote);
  setOperationAction(ISD::FSINCOS, MVT::v8f16, Promote);
  setOperationAction(ISD::FEXP, MVT::f16, Promote);
  setOperationAction(ISD::FEXP, MVT::v4f16, Promote);
  setOperationAction(ISD::FEXP, MVT::v8f16, Promote);
  setOperationAction(ISD::FEXP2, MVT::f16, Promote);
  setOperationAction(ISD::FEXP2, MVT::v4f16, Promote);
  setOperationAction(ISD::FEXP2, MVT::v8f16, Promote);
  setOperationAction(ISD::FLOG, MVT::f16, Promote);
  setOperationAction(ISD::FLOG, MVT::v4f16, Promote);
  setOperationAction(ISD::FLOG, MVT::v8f16, Promote);
  setOperationAction(ISD::FLOG2, MVT::f16, Promote);
  setOperationAction(ISD::FLOG2, MVT::v4f16, Promote);
  setOperationAction(ISD::FLOG2, MVT::v8f16, Promote);
  setOperationAction(ISD::FLOG10, MVT::f16, Promote);
  setOperationAction(ISD::FLOG10, MVT::v4f16, Promote);
  setOperationAction(ISD::FLOG10, MVT::v8f16, Promote);
  if (!Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SELECT, MVT::f16, Promote);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
    setOperationAction(ISD::SETCC, MVT::f16, Promote);
    setOperationAction(ISD::BR_CC, MVT::f16, Promote);
    setOperationAction(ISD::FADD, MVT::f16, Promote);
    setOperationAction(ISD::FSUB, MVT::f16, Promote);
    setOperationAction(ISD::FMUL, MVT::f16, Promote);
    setOperationAction(ISD::FDIV, MVT::f16, Promote);
    setOperationAction(ISD::FMA, MVT::f16, Promote);
    setOperationAction(ISD::FNEG, MVT::f16, Promote);
    setOperationAction(ISD::FABS, MVT::f16, Promote);
    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
    setOperationAction(ISD::FRINT, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Promote);
    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
    setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
    setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
    // promote v4f16 to v4f32 when that is known to be safe.
    setOperationAction(ISD::FADD, MVT::v4f16, Promote);
    setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
    setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
    setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
    setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
    setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
    AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
    setOperationAction(ISD::FMA, MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

    setOperationAction(ISD::FABS, MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
    setOperationAction(ISD::FMA, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  }
  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (MVT Ty : {MVT::f32, MVT::f64}) {
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
    setOperationAction(ISD::FMINNUM, Ty, Legal);
    setOperationAction(ISD::FMAXNUM, Ty, Legal);
    setOperationAction(ISD::FMINNAN, Ty, Legal);
    setOperationAction(ISD::FMAXNAN, Ty, Legal);
  }

  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
    setOperationAction(ISD::FCEIL, MVT::f16, Legal);
    setOperationAction(ISD::FRINT, MVT::f16, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
    setOperationAction(ISD::FROUND, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
  }
  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }
  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
  }

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::FP_TO_UINT);
  setTargetDAGCombine(ISD::FDIV);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::CONCAT_VECTORS);
  setTargetDAGCombine(ISD::STORE);
  if (Subtarget->supportsAddressTopByteIgnored())
    setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::VSELECT);

  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);
  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(2);
  // Set preferred alignments.
  setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
  setPrefLoopAlignment(STI.getPrefLoopAlignment());

  // Only change the limit for entries in a jump table if specified by
  // the subtarget, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == 0)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
    // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
    // -> v8f16 conversions.
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
    // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    for (MVT VT : MVT::integer_valuetypes()) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    for (MVT VT : MVT::fp_valuetypes()) {
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
    }
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }
    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
    }
  }

  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
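
/// Install the operation actions shared by every NEON vector type once it has
/// been given a register class. \p PromotedBitwiseVT is the companion integer
/// type supplied by the DR/QR helpers below (v2i32 for 64-bit vectors, v4i32
/// for 128-bit vectors).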
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
  assert(VT.isVector() && "VT should be a vector type");

  if (VT.isFloatingPoint()) {
    MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
    setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
    setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
  }

  // Mark vector float intrinsics as expand.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);

    // But we do support custom-lowering for FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  }

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  setOperationAction(ISD::SRA, VT, Custom);
  setOperationAction(ISD::SRL, VT, Custom);
  setOperationAction(ISD::SHL, VT, Custom);
  setOperationAction(ISD::AND, VT, Custom);
  setOperationAction(ISD::OR, VT, Custom);
  setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  for (MVT InnerVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

  // CNT supports only B element sizes.
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT, Expand);

  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
  setOperationAction(ISD::FP_TO_UINT, VT, Custom);

  if (!VT.isFloatingPoint())
    setOperationAction(ISD::ABS, VT, Legal);

  // [SU][MIN|MAX] are available for all NEON types apart from i64.
  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);

  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
  if (VT.isFloatingPoint() &&
      (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
                            ISD::FMINNUM, ISD::FMAXNUM})
      setOperationAction(Opcode, VT, Legal);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
    }
  }
}
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, MVT::v2i32);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, MVT::v4i32);
}
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                              EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
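
/// Try to rewrite the immediate operand of a logical instruction when it is
/// not encodable as an AArch64 bitmask immediate: bits that are not demanded
/// may be given arbitrary values, so pick values that yield an encodable (or
/// trivially foldable) constant. On success the rewritten node is committed
/// through TLO.CombineTo.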
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
                               const APInt &Demanded,
                               TargetLowering::TargetLoweringOpt &TLO,
                               unsigned NewOpc) {
  uint64_t OldImm = Imm, NewImm, Enc;
  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;

  // Return if the immediate is already all zeros, all ones, a bimm32 or a
  // bimm64.
  if (Imm == 0 || Imm == Mask ||
      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
    return false;

  unsigned EltSize = Size;
  uint64_t DemandedBits = Demanded.getZExtValue();

  // Clear bits that are not demanded.
  Imm &= DemandedBits;

  while (true) {
    // The goal here is to set the non-demanded bits in a way that minimizes
    // the number of switching between 0 and 1. In order to achieve this goal,
    // we set the non-demanded bits to the value of the preceding demanded bits.
    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
    // The final result is 0b11000011.
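    // In that example, 0b11000011 is accepted by the check below because its
    // complement within the element, 0b00111100, is a single contiguous run of
    // ones and is therefore encodable as a bitmask immediate element.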
    uint64_t NonDemandedBits = ~DemandedBits;
    uint64_t InvertedImm = ~Imm & DemandedBits;
    uint64_t RotatedImm =
        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
        NonDemandedBits;
    uint64_t Sum = RotatedImm + NonDemandedBits;
    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
    NewImm = (Imm | Ones) & Mask;

    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
    // we halve the element size and continue the search.
    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
      break;

    // We cannot shrink the element size any further if it is 2-bits.
    if (EltSize == 2)
      return false;

    EltSize /= 2;
    Mask >>= EltSize;
    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;

    // Return if there is mismatch in any of the demanded bits of Imm and Hi.
    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
      return false;

    // Merge the upper and lower halves of Imm and DemandedBits.
    Imm |= Hi;
    DemandedBits |= DemandedBitsHi;
  }

  ++NumOptimizedImms;

  // Replicate the element across the register width.
  while (EltSize < Size) {
    NewImm |= NewImm << EltSize;
    EltSize *= 2;
  }

  (void)OldImm;
  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
         "demanded bits should never be altered");
  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");

  // Create the new constant immediate node.
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue New;

  // If the new constant immediate is all-zeros or all-ones, let the target
  // independent DAG combine optimize this node.
  if (NewImm == 0 || NewImm == OrigMask) {
    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                          TLO.DAG.getConstant(NewImm, DL, VT));
  // Otherwise, create a machine node so that target independent DAG combine
  // doesn't undo this optimization.
  } else {
    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
    New = SDValue(
        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
  }

  return TLO.CombineTo(Op, New);
}
bool AArch64TargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
  // Delay this optimization to as late as possible.
  if (!TLO.LegalOps)
    return false;

  if (!EnableOptimizeLogicalImm)
    return false;

  EVT VT = Op.getValueType();
  if (VT.isVector())
    return false;

  unsigned Size = VT.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "i32 or i64 is expected after legalization.");

  // Exit early if we demand all bits.
  if (Demanded.countPopulation() == Size)
    return false;

  unsigned NewOpc;
  switch (Op.getOpcode()) {
  default:
    return false;
  case ISD::AND:
    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
    break;
  case ISD::OR:
    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
    break;
  case ISD::XOR:
    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
    break;
  }
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;
  uint64_t Imm = C->getZExtValue();
  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    KnownBits Known2;
    DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
    DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
    Known.Zero &= Known2.Zero;
    Known.One &= Known2.One;
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = Known.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        Known.Zero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        Known.Zero |= Mask;
      }
      break;
    }
    }
    break;
  }
  }
}
AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout
&DL
,
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                           unsigned AddrSpace,
                                                           unsigned Align,
                                                           bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Align <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            VT == MVT::v2i64;
  }
  return true;
}
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AArch64ISD::NodeType)Opcode) {
  case AArch64ISD::FIRST_NUMBER: break;
  case AArch64ISD::CALL: return "AArch64ISD::CALL";
  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
  case AArch64ISD::ADC: return "AArch64ISD::ADC";
  case AArch64ISD::SBC: return "AArch64ISD::SBC";
  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
  case AArch64ISD::DUP: return "AArch64ISD::DUP";
  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
  case AArch64ISD::BICi: return "AArch64ISD::BICi";
  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
  case AArch64ISD::BSL: return "AArch64ISD::BSL";
  case AArch64ISD::NEG: return "AArch64ISD::NEG";
  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
  case AArch64ISD::REV16: return "AArch64ISD::REV16";
  case AArch64ISD::REV32: return "AArch64ISD::REV32";
  case AArch64ISD::REV64: return "AArch64ISD::REV64";
  case AArch64ISD::EXT: return "AArch64ISD::EXT";
  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
  case AArch64ISD::NOT: return "AArch64ISD::NOT";
  case AArch64ISD::BIT: return "AArch64ISD::BIT";
  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
  }
  return nullptr;
}
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:

  // OrigBB:
  //     [... previous instrs leading to comparison ...]
  //     b.ne TrueBB
  //     b EndBB
  // TrueBB:
  //     ; Fallthrough
  // EndBB:
  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction::iterator It = ++MBB->getIterator();

  unsigned DestReg = MI.getOperand(0).getReg();
  unsigned IfTrueReg = MI.getOperand(1).getReg();
  unsigned IfFalseReg = MI.getOperand(2).getReg();
  unsigned CondCode = MI.getOperand(3).getImm();
  bool NZCVKilled = MI.getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB
  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // TrueBB falls through to the end.
  TrueBB->addSuccessor(EndBB);

  if (!NZCVKilled) {
    TrueBB->addLiveIn(AArch64::NZCV);
    EndBB->addLiveIn(AArch64::NZCV);
  }

  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
      .addReg(IfTrueReg)
      .addMBB(TrueBB)
      .addReg(IfFalseReg)
      .addMBB(MBB);

  MI.eraseFromParent();
  return EndBB;
}
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  default:
#ifndef NDEBUG
    MI.dump();
#endif
    llvm_unreachable("Unexpected instruction for custom inserter!");

  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);
  }
}
//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  default:
    llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:
    return AArch64CC::NE;
  case ISD::SETEQ:
    return AArch64CC::EQ;
  case ISD::SETGT:
    return AArch64CC::GT;
  case ISD::SETGE:
    return AArch64CC::GE;
  case ISD::SETLT:
    return AArch64CC::LT;
  case ISD::SETLE:
    return AArch64CC::LE;
  case ISD::SETUGT:
    return AArch64CC::HI;
  case ISD::SETUGE:
    return AArch64CC::HS;
  case ISD::SETULT:
    return AArch64CC::LO;
  case ISD::SETULE:
    return AArch64CC::LS;
  }
}
/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                  AArch64CC::CondCode &CondCode,
                                  AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = AArch64CC::EQ;
    break;
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = AArch64CC::GT;
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = AArch64CC::GE;
    break;
  case ISD::SETOLT:
    CondCode = AArch64CC::MI;
    break;
  case ISD::SETOLE:
    CondCode = AArch64CC::LS;
    break;
  case ISD::SETONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case ISD::SETO:
    CondCode = AArch64CC::VC;
    break;
  case ISD::SETUO:
    CondCode = AArch64CC::VS;
    break;
  case ISD::SETUEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case ISD::SETUGT:
    CondCode = AArch64CC::HI;
    break;
  case ISD::SETUGE:
    CondCode = AArch64CC::PL;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = AArch64CC::LT;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = AArch64CC::LE;
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = AArch64CC::NE;
    break;
  }
}
/// Convert a DAG fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case ISD::SETONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case ISD::SETUEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;
  switch (CC) {
  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    break;
  case ISD::SETUO:
    Invert = true;
    LLVM_FALLTHROUGH;
  case ISD::SETO:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    break;
  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
    break;
  }
}
static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
  DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n"));
  return IsLegal;
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();
  const bool FullFP16 =
      static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();

  if (VT.isFloatingPoint()) {
    assert(VT != MVT::f128);
    if (VT == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
      VT = MVT::f32;
    }
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
  }

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
    // can be set differently by this operation. It comes down to whether
    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
    // everything is fine. If not then the optimization is wrong. Thus general
    // comparisons are only valid if op2 != 0.
    //
    // So, finally, the only LLVM-native comparisons that don't mention C and V
    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
    // the absence of information about op2.
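    //
    // A concrete illustration (added here, not from the original comment):
    // with 32-bit op1 == 1 and op2 == INT32_MIN, (sub 0, op2) wraps back to
    // INT32_MIN, so SUBS computes 1 - INT32_MIN and sets V, while ADDS
    // computes 1 + INT32_MIN and does not. N and Z still agree (the result
    // bits are identical), so EQ/NE survive the rewrite, but a signed
    // predicate such as GT, which reads V, would flip.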
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
             !isUnsignedIntSetCC(CC)) {
    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
    // of the signed comparisons.
    Opcode = AArch64ISD::ANDS;
    RHS = LHS.getOperand(1);
    LHS = LHS.getOperand(0);
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
      .getValue(1);
}
/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// In general we can create code for arbitrary "... (and (and A B) C)"
/// sequences. We can also implement some "or" expressions, because "(or A B)"
/// is equivalent to "not (and (not A) (not B))" and we can implement some
/// negation operations:
/// We can negate the results of a single comparison by inverting the flags
/// used when the predicate fails and inverting the flags tested in the next
/// instruction; We can also negate the results of the whole previous
/// conditional compare sequence by inverting the flags tested in the next
/// instruction. However there is no way to negate the result of a partial
/// sequence.
///
/// Therefore on encountering an "or" expression we can negate the subtree on
/// one side and have to be able to push the negate to the leafs of the subtree
/// on the other side (see also the comments in code). As complete example:
/// "or (or (setCA (cmp A)) (setCB (cmp B)))
///     (and (setCC (cmp C)) (setCD (cmp D)))"
/// is transformed to
/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
///           (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
/// and implemented as:
///   cmp C
///   ccmp D, inv(CD), CC
///   ccmp A, CA, inv(CD)
///   ccmp B, CB, inv(CA)
///   check for CB flags
/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
/// by conditional compare sequences.
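///
/// As an additional illustration (not part of the original comment; the exact
/// schedule is up to the compiler), "(and (setlt a, b) (seteq c, d))" can be
/// emitted following the scheme above as:
///   cmp  a, b
///   ccmp c, d, #0, lt   // if "lt" fails, NZCV=0000 makes the final EQ false
///   cset w0, eq
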
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue CCOp,
                                         AArch64CC::CondCode Predicate,
                                         AArch64CC::CondCode OutCC,
                                         const SDLoc &DL, SelectionDAG &DAG) {
  unsigned Opcode = 0;
  const bool FullFP16 =
      static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();

  if (LHS.getValueType().isFloatingPoint()) {
    assert(LHS.getValueType() != MVT::f128);
    if (LHS.getValueType() == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
    }
    Opcode = AArch64ISD::FCCMP;
  } else if (RHS.getOpcode() == ISD::SUB) {
    SDValue SubOp0 = RHS.getOperand(0);
    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // See emitComparison() on why we can only do this for SETEQ and SETNE.
      Opcode = AArch64ISD::CCMN;
      RHS = RHS.getOperand(1);
    }
  }
  if (Opcode == 0)
    Opcode = AArch64ISD::CCMP;

  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
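  // Note (explanatory, not in the original source): the NZCV value is applied
  // only when Predicate is false, and it is chosen to satisfy the *inverted*
  // OutCC. E.g. for OutCC == EQ the inverted code is NE, which NZCV == 0b0000
  // (Z clear) satisfies, so a failed predicate makes the final EQ test read
  // false.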
  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
/// CanPushNegate is set to true if we can push a negate operation through
/// the tree in a way that we are left with AND operations and negate operations
/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
/// brought into such a form.
static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
                                         unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    if (Val->getOperand(0).getValueType() == MVT::f128)
      return false;
    CanNegate = true;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == ISD::AND || Opcode == ISD::OR) {
    SDValue O0 = Val->getOperand(0);
    SDValue O1 = Val->getOperand(1);
    bool CanNegateL;
    if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
      return false;
    bool CanNegateR;
    if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
      return false;

    if (Opcode == ISD::OR) {
      // For an OR expression we need to be able to negate at least one side or
      // we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // We can however change a (not (or x y)) to (and (not x) (not y)) if we
      // can negate the x and y subtrees.
      CanNegate = CanNegateL && CanNegateR;
    } else {
      // If the operands are OR expressions then we finally need to negate their
      // outputs; we can only do that for the operand emitted last by negating
      // OutCC, not for both operands.
      bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
      bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
      if (NeedsNegOutL && NeedsNegOutR)
        return false;
      // We cannot negate an AND operation (it would become an OR).
      CanNegate = false;
    }
    return true;
  }
  return false;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// On recursive invocations @p PushNegate may be set to true to have negation
/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
/// for the comparisons in the current subtree; @p Depth limits the search
/// depth to avoid stack overflow.
static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
                                                 AArch64CC::CondCode &OutCC,
                                                 bool Negate, SDValue CCOp,
                                                 AArch64CC::CondCode Predicate) {
  // We're at a tree leaf, produce a conditional comparison operation.
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    SDValue LHS = Val->getOperand(0);
    SDValue RHS = Val->getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
    bool isInteger = LHS.getValueType().isInteger();
    if (Negate)
      CC = getSetCCInverse(CC, isInteger);
    SDLoc DL(Val);
    // Determine OutCC and handle FP special case.
    if (isInteger) {
      OutCC = changeIntCCToAArch64CC(CC);
    } else {
      assert(LHS.getValueType().isFloatingPoint());
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        SDValue ExtraCmp;
        if (!CCOp.getNode())
          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
        else
          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
                                               ExtraCC, DL, DAG);
        CCOp = ExtraCmp;
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp.getNode())
      return emitComparison(LHS, RHS, CC, DL, DAG);
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                     DAG);
  }
  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
         "Valid conjunction/disjunction tree");

  // Check if both sides can be transformed.
  SDValue LHS = Val->getOperand(0);
  SDValue RHS = Val->getOperand(1);

  // In case of an OR we need to negate our operands and the result.
  // (A v B) <=> not(not(A) ^ not(B))
  bool NegateOpsAndResult = Opcode == ISD::OR;
  // We can negate the results of all previous operations by inverting the
  // predicate flags giving us a free negation for one side. The other side
  // must be negatable by itself.
  if (NegateOpsAndResult) {
    // See which side we can negate.
    bool CanNegateL;
    bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
    assert(isValidL && "Valid conjunction/disjunction tree");
    bool CanNegateR;
    bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
    assert(isValidR && "Valid conjunction/disjunction tree");
    assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");

    // Order the side which we cannot negate to RHS so we can emit it first.
    if (!CanNegateL)
      std::swap(LHS, RHS);
  } else {
    bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
    assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
           "Valid conjunction/disjunction tree");
    // Order the side where we need to negate the output flags to RHS so it
    // gets emitted first.
    if (NeedsNegOutL)
      std::swap(LHS, RHS);
  }

  // Emit RHS. If we want to negate the tree we only need to push a negate
  // through if we are already in a PushNegate case, otherwise we can negate
  // the "flags to test" afterwards.
  AArch64CC::CondCode RHSCC;
  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
                                                   CCOp, Predicate);
  if (NegateOpsAndResult && !Negate)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  // Emit LHS. We may need to negate it.
  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
                                                   NegateOpsAndResult, CmpR,
                                                   RHSCC);
  // If we transformed an OR to an AND then we have to negate the result
  // (or absorb the Negate parameter).
  if (NegateOpsAndResult && !Negate)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// \see emitConjunctionDisjunctionTreeRec().
static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
                                              AArch64CC::CondCode &OutCC) {
  bool CanNegate;
  if (!isConjunctionDisjunctionTree(Val, CanNegate))
    return SDValue();

  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
                                           AArch64CC::AL);
}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &AArch64cc, SelectionDAG &DAG,
                             const SDLoc &dl) {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    EVT VT = RHS.getValueType();
    uint64_t C = RHSC->getZExtValue();
    if (!isLegalArithImmed(C)) {
      // Constant does not fit, try adjusting it by one?
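      // For example (illustrative, not from the original comment): 4097 is not
      // a legal arithmetic immediate, so "x < 4097" is rewritten below as
      // "x <= 4096", and 4096 = 0x1000 does encode as a shifted 12-bit
      // immediate.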
      switch (CC) {
      default:
        break;
      case ISD::SETLT:
      case ISD::SETGE:
        if ((VT == MVT::i32 && C != 0x80000000 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0x80000000ULL &&
             isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if ((VT == MVT::i32 && C != 0 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if ((VT == MVT::i32 && C != INT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != INT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if ((VT == MVT::i32 && C != UINT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != UINT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      }
    }
  }
  SDValue Cmp;
  AArch64CC::CondCode AArch64CC;
  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);

    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
    // For the i8 operand, the largest immediate is 255, so this can be easily
    // encoded in the compare instruction. For the i16 operand, however, the
    // largest immediate cannot be encoded in the compare.
    // Therefore, use a sign extending load and cmn to avoid materializing the
    // -1 constant. For example,
    //   movz  w1, #65535
    //   ldrh  w0, [x0, #0]
    //   cmp   w0, w1
    // =>
    //   ldrsh w0, [x0, #0]
    //   cmn   w0, #1
    //
    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
    // if and only if (sext LHS) == (sext RHS). The checks are in place to
    // ensure both the LHS and RHS are truly zero extended and to make sure the
    // transformation is profitable.
    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
        LHS.getNode()->hasNUsesOfValue(1, 0)) {
      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
        SDValue SExt =
            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
                        DAG.getValueType(MVT::i16));
        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
                                                   RHS.getValueType()),
                             CC, dl, DAG);
        AArch64CC = changeIntCCToAArch64CC(CC);
      }
    }

    if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
      if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
        if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
      }
    }
  }

  if (!Cmp) {
    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
    AArch64CC = changeIntCCToAArch64CC(CC);
  }
  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
  return Cmp;
}
static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
         "Unsupported value type");
  SDValue Value, Overflow;
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  unsigned Opc = 0;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::VS;
    break;
  case ISD::UADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::HS;
    break;
  case ISD::SSUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::VS;
    break;
  case ISD::USUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::LO;
    break;
  // Multiply needs a little bit extra work.
  case ISD::SMULO:
  case ISD::UMULO: {
    CC = AArch64CC::NE;
    bool IsSigned = Op.getOpcode() == ISD::SMULO;
    if (Op.getValueType() == MVT::i32) {
      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      // For a 32 bit multiply with overflow check we want the instruction
      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
      // need to generate the following pattern:
      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
                                DAG.getConstant(0, DL, MVT::i64));
      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
      // operation. We need to clear out the upper 32 bits, because we used a
      // widening multiply that wrote all 64 bits. In the end this should be a
      // noop.
      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
      if (IsSigned) {
        // The signed overflow check requires more than just a simple check for
        // any bit set in the upper 32 bits of the result. These bits could be
        // just the sign bits of a negative number. To perform the overflow
        // check we have to arithmetic shift right the 32nd bit of the result by
        // 31 bits. Then we compare the result to the upper 32 bits.
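        // Worked example (added for illustration, not in the original
        // comment): for 2147483647 * 2 the 64-bit product is 0xFFFFFFFE, so
        // the upper 32 bits are 0, while the low 32 bits arithmetically
        // shifted right by 31 give -1; the two differ, so signed overflow is
        // reported, as expected.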
        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
                                        DAG.getConstant(32, DL, MVT::i64));
        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
                                        DAG.getConstant(31, DL, MVT::i64));
        // It is important that LowerBits is last, otherwise the arithmetic
        // shift will not be folded into the compare (SUBS).
        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                       .getValue(1);
      } else {
        // The overflow check for unsigned multiply is easy. We only need to
        // check if any of the upper 32 bits are set. This can be done with a
        // CMP (shifted register). For that we need to generate the following
        // pattern:
        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
                                        DAG.getConstant(32, DL, MVT::i64));
        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
        Overflow =
            DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                        DAG.getConstant(0, DL, MVT::i64),
                        UpperBits).getValue(1);
      }
      break;
    }
    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64 bit multiply
    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
    if (IsSigned) {
      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
                                      DAG.getConstant(63, DL, MVT::i64));
      // It is important that LowerBits is last, otherwise the arithmetic
      // shift will not be folded into the compare (SUBS).
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                     .getValue(1);
    } else {
      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow =
          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                      DAG.getConstant(0, DL, MVT::i64),
                      UpperBits).getValue(1);
    }
    break;
  }
  }

  if (Opc) {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

    // Emit the AArch64 operation with overflow check.
    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
    Overflow = Value.getValue(1);
  }
  return std::make_pair(Value, Overflow);
}
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                             RTLIB::Libcall Call) const {
  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
}
// Returns true if the given Op is the overflow flag result of an overflow
// intrinsic operation.
static bool isOverflowIntrOpRes(SDValue Op) {
  unsigned Opc = Op.getOpcode();
  return (Op.getResNo() == 1 &&
          (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
           Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
  SDValue Sel = Op.getOperand(0);
  SDValue Other = Op.getOperand(1);
  SDLoc dl(Sel);

  // If the operand is an overflow checking operation, invert the condition
  // code and kill the Not operation. I.e., transform:
  // (xor (overflow_op_bool, 1))
  //   -->
  // (csel 1, 0, invert(cc), overflow_op_bool)
  // ... which later gets transformed to just a cset instruction with an
  // inverted condition code, rather than a cset + eor sequence.
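  // For instance (illustrative): "xor i1 %ovf, true", where %ovf is the
  // overflow bit of llvm.sadd.with.overflow, typically becomes a single
  // "cset wN, vc" after the adds, instead of "cset wN, vs" followed by an eor.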
  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
      return SDValue();

    SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
    SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
    AArch64CC::CondCode CC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  // If neither operand is a SELECT_CC, give up.
  if (Sel.getOpcode() != ISD::SELECT_CC)
    std::swap(Sel, Other);
  if (Sel.getOpcode() != ISD::SELECT_CC)
    return Op;

  // The folding we want to perform is:
  // (xor x, (select_cc a, b, cc, 0, -1) )
  //   -->
  // (csel x, (xor x, -1), cc ...)
  //
  // The latter will get matched to a CSINV instruction.

  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
  SDValue LHS = Sel.getOperand(0);
  SDValue RHS = Sel.getOperand(1);
  SDValue TVal = Sel.getOperand(2);
  SDValue FVal = Sel.getOperand(3);

  // FIXME: This could be generalized to non-integer comparisons.
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return Op;

  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

  // The values aren't constants, this isn't the pattern we're looking for.
  if (!CFVal || !CTVal)
    return Op;

  // We can commute the SELECT_CC by inverting the condition. This
  // might be needed to make this fit into a CSINV pattern.
  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
    std::swap(TVal, FVal);
    std::swap(CTVal, CFVal);
    CC = ISD::getSetCCInverse(CC, true);
  }

  // If the constants line up, perform the transform!
  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

    FVal = Other;
    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
                       DAG.getConstant(-1ULL, dl, Other.getValueType()));

    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
                       CCVal, Cmp);
  }

  return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Invalid code");
  case ISD::ADDC:
    Opc = AArch64ISD::ADDS;
    break;
  case ISD::SUBC:
    Opc = AArch64ISD::SUBS;
    break;
  case ISD::ADDE:
    Opc = AArch64ISD::ADCS;
    ExtraOp = true;
    break;
  case ISD::SUBE:
    Opc = AArch64ISD::SBCS;
    ExtraOp = true;
    break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
                     Op.getOperand(2));
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDLoc dl(Op);
  AArch64CC::CondCode CC;
  // The actual operation that sets the overflow or carry flag.
  SDValue Value, Overflow;
  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);

  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);

  // We use an inverted condition, because the conditional select is inverted
  // too. This will allow it to be selected to a single instruction:
  // CSINC Wd, WZR, WZR, invert(cond).
  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
                         CCVal, Overflow);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  bool IsStream = !Locality;
  // When the locality number is set
  if (Locality) {
    // The front-end should have filtered out the out-of-range values
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed.
    // Put the number the other way around.
    // The encoding starts at 0 for level 1
    Locality = 3 - Locality;
  }

  // Build the mask value encoding the expected behavior.
  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
                   (!IsData << 3) |     // IsDataCache bit
                   (Locality << 1) |    // Cache level bits
                   (unsigned)IsStream;  // Stream bit
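  // For example (illustrative): a read prefetch of data with locality 3 yields
  // IsWrite=0, IsData=1, Locality=0 and IsStream=0, i.e. PrfOp == 0, which is
  // the PLDL1KEEP hint.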
  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
}
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");

  RTLIB::Libcall LC;
  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());

  return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
                                             SelectionDAG &DAG) const {
  if (Op.getOperand(0).getValueType() != MVT::f128) {
    // It's legal except when f128 is involved
    return Op;
  }

  RTLIB::Libcall LC;
  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());

  // FP_ROUND node has a second operand indicating whether it is known to be
  // precise. That doesn't take part in the LibCall so we can't directly use
  // LowerF128Call.
  SDValue SrcVal = Op.getOperand(0);
  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
                     SDLoc(Op)).first;
}
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  EVT InVT = Op.getOperand(0).getValueType();
  EVT VT = Op.getValueType();
  unsigned NumElts = InVT.getVectorNumElements();

  // f16 vectors are promoted to f32 before a conversion.
  if (InVT.getVectorElementType() == MVT::f16) {
    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
    SDLoc dl(Op);
    return DAG.getNode(
        Op.getOpcode(), dl, Op.getValueType(),
        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
  }

  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
    SDLoc dl(Op);
    SDValue Cv =
        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
                    Op.getOperand(0));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
  }

  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
    SDLoc dl(Op);
    MVT ExtVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
                         VT.getVectorNumElements());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
  }

  // Type changing conversions are illegal.
  return Op;
}
AArch64TargetLowering::LowerFP_TO_INT(SDValue Op
,
2225 SelectionDAG
&DAG
) const {
2226 if (Op
.getOperand(0).getValueType().isVector())
2227 return LowerVectorFP_TO_INT(Op
, DAG
);
2229 // f16 conversions are promoted to f32 when full fp16 is not supported.
2230 if (Op
.getOperand(0).getValueType() == MVT::f16
&&
2231 !Subtarget
->hasFullFP16()) {
2234 Op
.getOpcode(), dl
, Op
.getValueType(),
2235 DAG
.getNode(ISD::FP_EXTEND
, dl
, MVT::f32
, Op
.getOperand(0)));
2238 if (Op
.getOperand(0).getValueType() != MVT::f128
) {
2239 // It's legal except when f128 is involved
2244 if (Op
.getOpcode() == ISD::FP_TO_SINT
)
2245 LC
= RTLIB::getFPTOSINT(Op
.getOperand(0).getValueType(), Op
.getValueType());
2247 LC
= RTLIB::getFPTOUINT(Op
.getOperand(0).getValueType(), Op
.getValueType());
2249 SmallVector
<SDValue
, 2> Ops(Op
->op_begin(), Op
->op_end());
2250 return makeLibCall(DAG
, LC
, Op
.getValueType(), Ops
, false, SDLoc(Op
)).first
;
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  EVT InVT = In.getValueType();

  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
    MVT CastVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
                         InVT.getVectorNumElements());
    In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
    return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
    unsigned CastOpc =
        Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    EVT CastVT = VT.changeVectorElementTypeToInteger();
    In = DAG.getNode(CastOpc, dl, CastVT, In);
    return DAG.getNode(Op.getOpcode(), dl, VT, In);
  }

  return Op;
}
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (Op.getValueType() == MVT::f16 &&
      !Subtarget->hasFullFP16()) {
    SDLoc dl(Op);
    return DAG.getNode(
        ISD::FP_ROUND, dl, MVT::f16,
        DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
        DAG.getIntPtrConstant(0, dl));
  }

  // i128 conversions are libcalls.
  if (Op.getOperand(0).getValueType() == MVT::i128)
    return SDValue();

  // Other conversions are legal, unless it's to the completely software-based
  // fp128.
  if (Op.getValueType() != MVT::f128)
    return Op;

  RTLIB::Libcall LC;
  if (Op.getOpcode() == ISD::SINT_TO_FP)
    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
  else
    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());

  return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
                                            SelectionDAG &DAG) const {
  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // which returns the values in two S / D registers.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  ArgListTy Args;
  ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
                                        : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));

  StructType *RetTy = StructType::get(ArgTy, ArgTy);
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
  if (Op.getValueType() != MVT::f16)
    return SDValue();

  assert(Op.getOperand(0).getValueType() == MVT::i16);
  SDLoc DL(Op);

  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
  return SDValue(
      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
      0);
}
getExtensionTo64Bits(const EVT
&OrigVT
) {
2364 if (OrigVT
.getSizeInBits() >= 64)
2367 assert(OrigVT
.isSimple() && "Expecting a simple value type");
2369 MVT::SimpleValueType OrigSimpleTy
= OrigVT
.getSimpleVT().SimpleTy
;
2370 switch (OrigSimpleTy
) {
2371 default: llvm_unreachable("Unexpected Vector Type");
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
                                                 const EVT &OrigTy,
                                                 const EVT &ExtTy,
                                                 unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  EVT VT = N->getValueType(0);

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (const SDValue &Elt : N->op_values()) {
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
    } else {
      return false;
    }
  }

  return true;
}
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
                                             N->getOperand(0)->getValueType(0),
                                             N->getValueType(0),
                                             N->getOpcode());

  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::SIGN_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, true);
}

static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::ZERO_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, false);
}

static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = AArch64ISD::SMULL;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = AArch64ISD::UMULL;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = AArch64ISD::SMULL;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = AArch64ISD::UMULL;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        std::swap(N0, N1);
        NewOpc = AArch64ISD::UMULL;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a S/UMULL instruction
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
  if (!isMLA) {
    Op0 = skipExtensionForVectorMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }
  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::aarch64_neon_abs:
    return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_neon_smax:
    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umax:
    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_smin:
    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umin:
    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
}
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
  DEBUG(dbgs() << "Custom lowering: ");
  DEBUG(Op.dump());

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unimplemented operand");
  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::FADD:               return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
  case ISD::FSUB:               return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
  case ISD::FMUL:               return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
  case ISD::FDIV:               return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
  case ISD::FP_ROUND:           return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerVectorSRA_SRL_SHL(Op, DAG);
  case ISD::SHL_PARTS:          return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:          return LowerShiftRightParts(Op, DAG);
  case ISD::CTPOP:              return LowerCTPOP(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::AND:                return LowerVectorAND(Op, DAG);
  case ISD::OR:                 return LowerVectorOR(Op, DAG);
  case ISD::XOR:                return LowerXOR(Op, DAG);
  case ISD::PREFETCH:           return LowerPREFETCH(Op, DAG);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:     return LowerVECREDUCE(Op, DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerATOMIC_LOAD_SUB(Op, DAG);
  case ISD::ATOMIC_LOAD_AND:    return LowerATOMIC_LOAD_AND(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "AArch64GenCallingConv.inc"

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention.");
  case CallingConv::WebKit_JS:
    return CC_AArch64_WebKit_JS;
  case CallingConv::GHC:
    return CC_AArch64_GHC;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::PreserveMost:
  case CallingConv::CXX_FAST_TLS:
  case CallingConv::Swift:
    if (Subtarget->isTargetWindows() && IsVarArg)
      return CC_AArch64_Win64_VarArg;
    if (!Subtarget->isTargetDarwin())
      return CC_AArch64_AAPCS;
    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
  case CallingConv::Win64:
    return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
  }
}

CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
                                      : RetCC_AArch64_AAPCS;
}
SDValue AArch64TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At this point, Ins[].VT may already be promoted to i32. To correctly
  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
  // LocVT.
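  // For instance (illustrative), an i8 argument shows up here with Ins[].VT
  // already promoted to i32, but it is analyzed below with ValVT/LocVT i8, so
  // a stack-passed copy gets an 8-bit slot and an extending load rather than a
  // full i32 slot.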
  unsigned NumArgs = Ins.size();
  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned i = 0; i != NumArgs; ++i) {
    MVT ValVT = Ins[i].VT;
    if (Ins[i].isOrigArg()) {
      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[i].getOrigArgIndex();

      // Get type of the original argument.
      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
                                  /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ValVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ValVT = MVT::i16;
    }
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
    bool Res =
        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
    assert(!Res && "Call operand has unhandled type");
    (void)Res;
  }
  assert(ArgLocs.size() == Ins.size());
  SmallVector<SDValue, 16> ArgValues;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    if (Ins[i].Flags.isByVal()) {
      // Byval is used for HFAs in the PCS, but the system should work in a
      // non-compliant manner for larger structs.
      EVT PtrVT = getPointerTy(DAG.getDataLayout());
      int Size = Ins[i].Flags.getByValSize();
      unsigned NumRegs = (Size + 7) / 8;

      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types too.
      int FrameIdx =
          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
      InVals.push_back(FrameIdxN);
      continue;
    }

    if (VA.isRegLoc()) {
      // Arguments stored in registers.
      EVT RegVT = VA.getLocVT();

      SDValue ArgValue;
      const TargetRegisterClass *RC;

      if (RegVT == MVT::i32)
        RC = &AArch64::GPR32RegClass;
      else if (RegVT == MVT::i64)
        RC = &AArch64::GPR64RegClass;
      else if (RegVT == MVT::f16)
        RC = &AArch64::FPR16RegClass;
      else if (RegVT == MVT::f32)
        RC = &AArch64::FPR32RegClass;
      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
        RC = &AArch64::FPR64RegClass;
      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
        RC = &AArch64::FPR128RegClass;
      else
        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

      // Transform the arguments in physical registers into virtual ones.
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

      // If this is an 8, 16 or 32-bit value, it is really passed promoted
      // to 64 bits. Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default:
        llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full:
        break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::AExt:
      case CCValAssign::SExt:
      case CCValAssign::ZExt:
        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
        // nodes after our lowering.
        assert(RegVT == Ins[i].VT && "incorrect register location selected");
        break;
      }

      InVals.push_back(ArgValue);

    } else { // VA.isRegLoc()
      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
      unsigned ArgOffset = VA.getLocMemOffset();
      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;

      uint32_t BEAlign = 0;
      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
          !Ins[i].Flags.isInConsecutiveRegs())
        BEAlign = 8 - ArgSize;

      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
      SDValue ArgValue;

      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
      MVT MemVT = VA.getValVT();

      switch (VA.getLocInfo()) {
      default:
        break;
      case CCValAssign::BCvt:
        MemVT = VA.getLocVT();
        break;
      case CCValAssign::SExt:
        ExtType = ISD::SEXTLOAD;
        break;
      case CCValAssign::ZExt:
        ExtType = ISD::ZEXTLOAD;
        break;
      case CCValAssign::AExt:
        ExtType = ISD::EXTLOAD;
        break;
      }

      ArgValue = DAG.getExtLoad(
          ExtType, DL, VA.getLocVT(), Chain, FIN,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
          MemVT);

      InVals.push_back(ArgValue);
    }
  }

  // varargs
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  if (isVarArg) {
    if (!Subtarget->isTargetDarwin() || IsWin64) {
      // The AAPCS variadic function ABI is identical to the non-variadic
      // one. As a result there may be more arguments in registers and we should
      // save them for future reference.
      // Win64 variadic functions also pass arguments in registers, but all
      // float arguments are passed in integer registers.
      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
    }

    // This will point to the next argument passed via stack.
    unsigned StackOffset = CCInfo.getNextStackOffset();
    // We currently pass all varargs at 8-byte alignment.
    StackOffset = ((StackOffset + 7) & ~7);
    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
  }

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
    // This is a non-standard ABI so by fiat I say we're allowed to make full
    // use of the stack area to be popped, which must be aligned to 16 bytes in
    // any case:
    StackArgSize = alignTo(StackArgSize, 16);

    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
    // a multiple of 16.
    FuncInfo->setArgumentStackToRestore(StackArgSize);

    // This realignment carries over to the available bytes below. Our own
    // callers will guarantee the space is free by giving an aligned value to
    // CALLSEQ_START.
  }
  // Even if we're not expected to free up the space, it's useful to know how
  // much is there while considering tail calls (because we can reuse it).
  FuncInfo->setBytesInStackArgArea(StackArgSize);

  return Chain;
}
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                                SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue &Chain) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

  SmallVector<SDValue, 8> MemOps;

  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
                                          AArch64::X3, AArch64::X4, AArch64::X5,
                                          AArch64::X6, AArch64::X7 };
  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
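  // For example (illustrative): in a variadic function with three named
  // integer arguments, FirstVariadicGPR is 3, so GPRSaveSize is
  // 8 * (8 - 3) = 40 bytes, covering X3..X7.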
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    if (IsWin64) {
      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
      if (GPRSaveSize & 15)
        // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
                              -(int)alignTo(GPRSaveSize, 16), false);
    } else
      GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);

    SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);

    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
      unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store = DAG.getStore(
          Val.getValue(1), DL, Val, FIN,
          IsWin64
              ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
                                                  GPRIdx,
                                                  (i - FirstVariadicGPR) * 8)
              : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
      MemOps.push_back(Store);
      FIN =
          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
    }
  }
  FuncInfo->setVarArgsGPRIndex(GPRIdx);
  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

  if (Subtarget->hasFPARMv8() && !IsWin64) {
    static const MCPhysReg FPRArgRegs[] = {
        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
    int FPRIdx = 0;
    if (FPRSaveSize != 0) {
      FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);

      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);

      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
        unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

        SDValue Store = DAG.getStore(
            Val.getValue(1), DL, Val, FIN,
            MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                          DAG.getConstant(16, DL, PtrVT));
      }
    }
    FuncInfo->setVarArgsFPRIndex(FPRIdx);
    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
  }

  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val =
        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return CC == CallingConv::Fast;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::PreserveMost:
  case CallingConv::Swift:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
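// Note that only the fastcc convention can have TCO *guaranteed* (see
// canGuaranteeTCO above); for C, PreserveMost and Swift callees a tail call is
// only emitted when the sibling-call checks below succeed.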
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible (see
  // X86) but less efficient and uglier in LowerCall.
  for (Function::const_arg_iterator i = CallerF.arg_begin(),
                                    e = CallerF.arg_end();
       i != e; ++i)
    if (i->hasByValAttr())
      return false;

  if (getTargetMachine().Options.GuaranteedTailCallOpt)
    return canGuaranteeTCO(CalleeCC) && CCMatch;

  // Externally-defined functions with weak linkage should not be
  // tail-called on AArch64 when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!isVarArg || CalleeCC == CallingConv::C) &&
         "Unexpected variadic calling convention");

  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
    for (const CCValAssign &ArgLoc : ArgLocs)
      if (!ArgLoc.isRegLoc())
        return false;
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForCall(CalleeCC, isVarArg),
                                  CCAssignFnForCall(CallerCC, isVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;
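  // parametersInCSRMatch checks that every argument placed in a callee-saved
  // register is just the caller's own incoming value of that same register;
  // otherwise the tail call could clobber a CSR the caller must preserve.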
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
    return false;

  return true;
}
SDValue
AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
                                           MachineFrameInfo &MFI,
                                           int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding
  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
                            UE = DAG.getEntryNode().getNode()->use_end();
       U != UE; ++U)
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
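          // Two byte ranges overlap iff either one's first byte falls inside
          // the other; e.g. a clobbered slot covering bytes [16, 23] must wait
          // for an in-flight load of bytes [20, 27].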
          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
                                                   bool TailCallOpt) const {
  return CallCC == CallingConv::Fast && TailCallOpt;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool IsThisReturn = false;

  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  bool IsSibCall = false;

  if (IsTailCall) {
    // Check if it's really possible to do a tail call.
    IsTailCall = isEligibleForTailCallOptimization(
        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  if (IsVarArg) {
    // Handle fixed and variable vector arguments differently.
    // Variable vector arguments always go into memory.
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ArgVT = Outs[i].VT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
                                               /*IsVarArg=*/ !Outs[i].IsFixed);
      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
      assert(!Res && "Call operand has unhandled type");
    }
  } else {
    // At this point, Outs[].VT may already be promoted to i32. To correctly
    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
    // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
    // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
    // we use a special version of AnalyzeCallOperands to pass in ValVT and
    // LocVT.
    unsigned NumArgs = Outs.size();
    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ValVT = Outs[i].VT;
      // Get type of the original argument.
      EVT ActualVT = getValueType(DAG.getDataLayout(),
                                  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
                                  /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ValVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ValVT = MVT::i16;

      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
      assert(!Res && "Call operand has unhandled type");
    }
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int FPDiff = 0;

  if (IsTailCall && !IsSibCall) {
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

    // Since callee will pop argument stack as a tail call, we must keep the
    // popped size 16-byte aligned.
    NumBytes = alignTo(NumBytes, 16);

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // can actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started at
    // a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  }
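  // For example, a caller that received 32 bytes of stack arguments making a
  // tail call that needs 48 bytes of outgoing stack arguments gets
  // FPDiff = 32 - 48 = -16: the callee's fixed stack slots are created 16
  // bytes below the caller's incoming argument area.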
  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
                                        getPointerTy(DAG.getDataLayout()));

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      if (Outs[realArgIdx].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
      }
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
          Outs[0].VT == MVT::i64) {
        assert(VA.getLocVT() == MVT::i64 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
               "unexpected use of 'returned'");
        IsThisReturn = true;
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types too.
      uint32_t BEAlign = 0;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                        : VA.getValVT().getSizeInBits();
      OpSize = (OpSize + 7) / 8;
      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
          !Flags.isInConsecutiveRegs()) {
        if (OpSize < 8)
          BEAlign = 8 - OpSize;
      }
      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset + BEAlign;
      SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
      PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);

      if (IsTailCall) {
        Offset = Offset + FPDiff;
        int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo =
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.
        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
      } else {
        SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);

        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
        DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
                                               LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
        SDValue Cpy = DAG.getMemcpy(
            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
            /*isVol = */ false, /*AlwaysInline = */ false,
            /*isTailCall = */ false,
            DstInfo, MachinePointerInfo());

        MemOpChains.push_back(Cpy);
      } else {
        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
        // promoted to a legal register type i32, we should truncate Arg back to
        // i1/i8/i16.
        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
            VA.getValVT() == MVT::i16)
          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
        MemOpChains.push_back(Store);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
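  // The outgoing stack stores are independent of one another, so they hang off
  // a single TokenFactor here instead of being chained serially; this leaves
  // the scheduler free to reorder them.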
  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (auto &RegToPass : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                             RegToPass.second, InFlag);
    InFlag = Chain.getValue(1);
  }
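  // Each CopyToReg is glued to the previous one via InFlag, so the register
  // copies stay together immediately before the call and the argument
  // registers cannot be clobbered in between.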
  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    auto GV = G->getGlobal();
    if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
        AArch64II::MO_GOT) {
      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
    } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
      assert(Subtarget->isTargetWindows() &&
             "Windows is the only supported COFF target");
      Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
    } else {
      const GlobalValue *GV = G->getGlobal();
      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
    }
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    if (getTargetMachine().getCodeModel() == CodeModel::Large &&
        Subtarget->isTargetMachO()) {
      const char *Sym = S->getSymbol();
      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
    } else {
      const char *Sym = S->getSymbol();
      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
    }
  }
  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
                               DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
    InFlag = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &RegToPass : RegsToPass)
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask;
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (IsThisReturn) {
    // For 'this' returns, use the X0-preserving mask if applicable
    Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
    if (!Mask) {
      IsThisReturn = false;
      Mask = TRI->getCallPreservedMask(MF, CallConv);
    }
  } else
    Mask = TRI->getCallPreservedMask(MF, CallConv);

  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  uint64_t CalleePopBytes =
      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
                             DAG.getIntPtrConstant(CalleePopBytes, DL, true),
                             InFlag, DL);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
}
bool AArch64TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC);
}
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (AArch64::GPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (AArch64::FPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}
//===----------------------------------------------------------------------===//
//  Other Lowering Code
//===----------------------------------------------------------------------===//
SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
}

SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
}

SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
                                   N->getOffset(), Flag);
}

SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
}
// (loadGOT sym)
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
                                      unsigned Flags) const {
  DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes instead of using a wrapper node.
  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
}
// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
                                            unsigned Flags) const {
  DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  const unsigned char MO_NC = AArch64II::MO_NC;
  return DAG.getNode(
      AArch64ISD::WrapperLarge, DL, Ty,
      getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
}
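// In the large code model the wrapper above is matched to a MOVZ/MOVK sequence
// that builds the full 64-bit address 16 bits at a time (G3 down to G0), so no
// ADRP page-relative addressing is needed.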
// (addlow (adrp %hi(sym)) %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
  DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
  SDValue Lo = getTargetNode(N, Ty, DAG,
                             AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GN->getGlobal();
  const AArch64II::TOF TargetFlags =
      (GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT
                                      : AArch64II::MO_NO_FLAG);
  unsigned char OpFlags =
      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
         "unexpected offset in global node");

  // This also catches the large code model case for Darwin.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    return getGOT(GN, DAG, TargetFlags);
  }

  SDValue Result;
  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    Result = getAddrLarge(GN, DAG, TargetFlags);
  } else {
    Result = getAddr(GN, DAG, TargetFlags);
  }
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(GN);
  if (GV->hasDLLImportStorageClass())
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}
/// \brief Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
///
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
///     adrp x0, _var@TLVPPAGE
///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
///                                      ; the function pointer
///     blr x1                           ; Uses descriptor address in x0
///                                      ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  SDValue TLVPAddr =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i64, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 8,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
          MachineMemOperand::MODereferenceable);
  Chain = FuncTLVGet.getValue(1);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const uint32_t *Mask =
      Subtarget->getRegisterInfo()->getTLSCallPreservedMask();

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
///
/// The sequence is:
///    adrp  x0, :tlsdesc:var
///    ldr   x1, [x0, #:tlsdesc_lo12:var]
///    add   x0, x0, #:tlsdesc_lo12:var
///    .tlsdesccall var
///    blr   x1
///    (TPIDR_EL0 offset now in x0)
///
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
SDValue
AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  SDValue Chain = DAG.getEntryNode();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  Chain =
      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
  SDValue Glue = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
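// LowerELFGlobalTLSAddress below selects between the local-exec sequence (two
// ADDs of :tprel: halves onto TPIDR_EL0), the initial-exec sequence (a GOT
// load of the thread-pointer offset) and the TLS-descriptor based
// general/local-dynamic sequences.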
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
  assert(Subtarget->useSmallAddressing() &&
         "ELF TLS only supported in small memory model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add -mtls-size command line option and make it control the 16MiB
  // vs. 4GiB code sequence generation.
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());

  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
    if (Model == TLSModel::LocalDynamic)
      Model = TLSModel::GeneralDynamic;
  }

  SDValue TPOff;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    SDValue TPWithOff_lo =
        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                   HiVar,
                                   DAG.getTargetConstant(0, DL, MVT::i32)),
                0);
    SDValue TPWithOff =
        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
                                   LoVar,
                                   DAG.getTargetConstant(0, DL, MVT::i32)),
                0);
    return TPWithOff;
  } else if (Model == TLSModel::InitialExec) {
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.
    AArch64FunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
    MFI->incNumLocalDynamicTLSAccesses();

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address of the symbol.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
                                                  AArch64II::MO_TLS);

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address of the symbol.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
SDValue
AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerDarwinGlobalTLSAddress(Op, DAG);
  if (Subtarget->isTargetELF())
    return LowerELFGlobalTLSAddress(Op, DAG);

  llvm_unreachable("Unexpected platform trying to use TLS");
}
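// LowerBR_CC below folds integer compare-with-zero branches into CBZ/CBNZ and,
// when the LHS is an AND with a power-of-two mask, into TBZ/TBNZ on the tested
// bit; e.g. branching on (and x, 8) == 0 becomes "tbz x, #3, dest".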
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Handle f128 first, since lowering it will result in comparing the return
  // value of a libcall against zero, which is just what the rest of LowerBR_CC
  // is expecting to deal with.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

    if (CC == ISD::SETNE)
      OFCC = getInvertedCondCode(OFCC);
    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Overflow);
  }

  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    // If the RHS of the comparison is zero, we can potentially fold this
    // to a specialized branch.
    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if (RHSC && RHSC->getZExtValue() == 0) {
      if (CC == ISD::SETEQ) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETNE) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
        // Don't combine AND since emitComparison converts the AND to an ANDS
        // (a.k.a. TST) and the test in the test bit and branch instruction
        // becomes redundant. This would also increase register pressure.
        uint64_t Mask = LHS.getValueSizeInBits() - 1;
        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
                           DAG.getConstant(Mask, dl, MVT::i64), Dest);
      }
    }
    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
        LHS.getOpcode() != ISD::AND) {
      // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
      uint64_t Mask = LHS.getValueSizeInBits() - 1;
      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                         DAG.getConstant(Mask, dl, MVT::i64), Dest);
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue BR1 =
      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
                       Cmp);
  }

  return BR1;
}
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue In1 = Op.getOperand(0);
  SDValue In2 = Op.getOperand(1);
  EVT SrcVT = In2.getValueType();

  if (SrcVT.bitsLT(VT))
    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
  else if (SrcVT.bitsGT(VT))
    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));

  EVT VecVT;
  uint64_t EltMask;
  SDValue VecVal1, VecVal2;

  auto setVecVal = [&] (int Idx) {
    if (!VT.isVector()) {
      VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In1);
      VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In2);
    } else {
      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
    }
  };

  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
    EltMask = 0x80000000ULL;
    setVecVal(AArch64::ssub);
  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
    VecVT = MVT::v2i64;

    // We want to materialize a mask with the high bit set, but the AdvSIMD
    // immediate moves cannot materialize that in a single instruction for
    // 64-bit elements. Instead, materialize zero and then negate it.
    EltMask = 0;

    setVecVal(AArch64::dsub);
  } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
    VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
    EltMask = 0x8000ULL;
    setVecVal(AArch64::hsub);
  } else {
    llvm_unreachable("Invalid type for copysign!");
  }

  SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);

  // If we couldn't materialize the mask above, then the mask vector will be
  // the zero vector, and we need to negate it here.
  if (VT == MVT::f64 || VT == MVT::v2f64) {
    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
  }

  SDValue Sel =
      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);

  if (VT == MVT::f16)
    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
  if (VT == MVT::f32)
    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
  else if (VT == MVT::f64)
    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
  else
    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
}
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          Attribute::NoImplicitFloat))
    return SDValue();

  if (!Subtarget->hasNEON())
    return SDValue();

  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  SDValue Val = Op.getOperand(0);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i32)
    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
  Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);

  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
  SDValue UaddLV = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);

  if (VT == MVT::i64)
    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
  return UaddLV;
}
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVSETCC(Op, DAG);

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDLoc dl(Op);

  // We chose ZeroOrOneBooleanContents, so use zero and one.
  EVT VT = Op.getValueType();
  SDValue TVal = DAG.getConstant(1, dl, VT);
  SDValue FVal = DAG.getConstant(0, dl, VT);

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets picked up by the next if statement.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, use it.
    if (!RHS.getNode()) {
      assert(LHS.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return LHS;
    }
  }

  if (LHS.getValueType().isInteger()) {
    SDValue CCVal;
    SDValue Cmp =
        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);

  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  // and do the comparison.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  if (CC2 == AArch64CC::AL) {
    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
  } else {
    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
    // totally clean. Some of them require two CSELs to implement. As is in
    // this case, we emit the first CSEL and then emit a second using the output
    // of the first as the RHS. We're effectively OR'ing the two CC's together.

    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
    SDValue CS1 =
        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }
}
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                              SDValue RHS, SDValue TVal,
                                              SDValue FVal, const SDLoc &dl,
                                              SelectionDAG &DAG) const {
  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Also handle f16, for which we need to do a f32 comparison.
  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
  }

  // Next, handle integers.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to for a CSINV or CSINC out of them.
    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      if (isNullConstant(TVal.getOperand(0))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (TrueVal == -FalseVal) {
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
        // 64-bit check whether we can use CSINC.
      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
        Opcode = AArch64ISD::CSINC;

        if (TrueVal > FalseVal) {
          Swap = true;
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    // Avoid materializing a constant when possible by reusing a known value in
    // a register. However, don't perform this optimization if the known value
    // is one, zero or negative one in the case of a CSEL. We can always
    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
    // FVal, respectively.
    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
        !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
      // "a != C ? x : a" to avoid materializing C.
      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
        TVal = LHS;
      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
        FVal = LHS;
    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
      // avoid materializing C.
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
        Opcode = AArch64ISD::CSINV;
        TVal = LHS;
        FVal = DAG.getConstant(0, dl, FVal.getValueType());
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    EVT VT = TVal.getValueType();
    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = TVal.getValueType();
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);

  if (DAG.getTarget().Options.UnsafeFPMath) {
    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
    if (RHSVal && RHSVal->isZero()) {
      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
        TVal = LHS;
      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
               CFVal && CFVal->isZero() &&
               FVal.getValueType() == LHS.getValueType())
        FVal = LHS;
    }
  }

  // Emit first, and possibly only, CSEL.
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS. We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                              SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  SDLoc DL(Op);
  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue CCVal = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);
  SDLoc DL(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (isOverflowIntrOpRes(CCVal)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
      return SDValue();

    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  // Lower it the same way as we would lower a SELECT_CC node.
  ISD::CondCode CC;
  SDValue LHS, RHS;
  if (CCVal.getOpcode() == ISD::SETCC) {
    LHS = CCVal.getOperand(0);
    RHS = CCVal.getOperand(1);
    CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
  } else {
    LHS = CCVal;
    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
    CC = ISD::SETNE;
  }
  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget->isTargetMachO()) {
    return getAddrLarge(JT, DAG);
  }
  return getAddr(JT, DAG);
}
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                 SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    // Use the GOT for the large code model on iOS.
    if (Subtarget->isTargetMachO()) {
      return getGOT(CP, DAG);
    }
    return getAddrLarge(CP, DAG);
  }
  return getAddr(CP, DAG);
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget->isTargetMachO()) {
    return getAddrLarge(BA, DAG);
  }
  return getAddr(BA, DAG);
}
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
  AArch64FunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
                                 getPointerTy(DAG.getDataLayout()));
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  AArch64FunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
                                     ? FuncInfo->getVarArgsGPRIndex()
                                     : FuncInfo->getVarArgsStackIndex(),
                                 getPointerTy(DAG.getDataLayout()));
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // The layout of the va_list struct is specified in the AArch64 Procedure
  // Call Standard, section B.3.
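  //
  // For reference (paraphrasing that definition), the structure being
  // initialised here is laid out as:
  //   struct va_list {
  //     void *__stack;   // offset 0:  next stacked argument
  //     void *__gr_top;  // offset 8:  end of the GP register save area
  //     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
  //     int   __gr_offs; // offset 24: negative offset from __gr_top
  //     int   __vr_offs; // offset 28: negative offset from __vr_top
  //   };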
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue VAList = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SmallVector<SDValue, 4> MemOps;

  // void *__stack at offset 0
  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
                                MachinePointerInfo(SV), /* Alignment = */ 8));

  // void *__gr_top at offset 8
  int GPRSize = FuncInfo->getVarArgsGPRSize();
  if (GPRSize > 0) {
    SDValue GRTop, GRTopAddr;

    GRTopAddr =
        DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));

    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
                        DAG.getConstant(GPRSize, DL, PtrVT));

    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
                                  MachinePointerInfo(SV, 8),
                                  /* Alignment = */ 8));
  }

  // void *__vr_top at offset 16
  int FPRSize = FuncInfo->getVarArgsFPRSize();
  if (FPRSize > 0) {
    SDValue VRTop, VRTopAddr;
    VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                            DAG.getConstant(16, DL, PtrVT));

    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
                        DAG.getConstant(FPRSize, DL, PtrVT));

    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
                                  MachinePointerInfo(SV, 16),
                                  /* Alignment = */ 8));
  }

  // int __gr_offs at offset 24
  SDValue GROffsAddr =
      DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
  MemOps.push_back(DAG.getStore(
      Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
      MachinePointerInfo(SV, 24), /* Alignment = */ 4));

  // int __vr_offs at offset 28
  SDValue VROffsAddr =
      DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
  MemOps.push_back(DAG.getStore(
      Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
      MachinePointerInfo(SV, 28), /* Alignment = */ 4));

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
    return LowerWin64_VASTART(Op, DAG);
  else if (Subtarget->isTargetDarwin())
    return LowerDarwin_VASTART(Op, DAG);
  else
    return LowerAAPCS_VASTART(Op, DAG);
}
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
                                           SelectionDAG &DAG) const {
  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
  // pointer (= 8 bytes) in total, as does Win64.
  SDLoc DL(Op);
  unsigned VaListSize =
      Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32;
  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
                       Op.getOperand(2),
                       DAG.getConstant(VaListSize, DL, MVT::i32),
                       8, false, false, false, MachinePointerInfo(DestSV),
                       MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "automatic va_arg instruction only works on Darwin");

  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  unsigned Align = Op.getConstantOperandVal(3);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
  Chain = VAList.getValue(1);

  if (Align > 8) {
    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                         DAG.getConstant(Align - 1, DL, PtrVT));
    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
                         DAG.getConstant(-(int64_t)Align, DL, PtrVT));
  }

  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
  uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

  // Scalar integer and FP values smaller than 64 bits are implicitly extended
  // up to 64 bits.  At the very least, we have to increase the striding of the
  // vaargs list to match this, and for FP values we need to introduce
  // FP_ROUND nodes as well.
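  // For example, an f32 va_arg still consumes a full 8-byte slot: the value
  // is loaded below as an f64 and then rounded back down to f32.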
  if (VT.isInteger() && !VT.isVector())
    ArgSize = 8;
  bool NeedFPTrunc = false;
  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
    ArgSize = 8;
    NeedFPTrunc = true;
  }

  // Increment the pointer, VAList, to the next vaarg
  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                               DAG.getConstant(ArgSize, DL, PtrVT));
  // Store the incremented VAList to the legalized pointer
  SDValue APStore =
      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

  // Load the actual argument out of the pointer VAList
  if (NeedFPTrunc) {
    // Load the value as an f64.
    SDValue WideFP =
        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
    // Round the value down to an f32.
    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
                                   DAG.getIntPtrConstant(1, DL));
    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
    // Merge the rounded value with the chain output of the load.
    return DAG.getMergeValues(Ops, DL);
  }

  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDValue FrameAddr =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                                  SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("sp", AArch64::SP)
                     .Case("x18", AArch64::X18)
                     .Case("w18", AArch64::W18)
                     .Default(0);
  if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
      !Subtarget->isX18Reserved())
    Reg = 0;

  if (Reg)
    return Reg;
  report_fatal_error(Twine("Invalid register name \""
                              + StringRef(RegName)  + "\"."));
}
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Return LR, which contains the return address. Mark it an implicit live-in.
  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
  SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);

  // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
  // is "undef". We wanted 0, so CSEL it directly.
  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
                               ISD::SETEQ, dl, DAG);
  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
  HiBitsForLo =
      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
                  HiBitsForLo, CCVal, Cmp);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i64));

  SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue LoForNormalShift =
      DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);

  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
                       dl, DAG);
  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
  SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
                           LoForNormalShift, CCVal, Cmp);

  // AArch64 shifts larger than the register width are wrapped rather than
  // clamped, so we can't just emit "hi >> x".
  SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue HiForBigShift =
      Opc == ISD::SRA
          ? DAG.getNode(Opc, dl, VT, ShOpHi,
                        DAG.getConstant(VTBits - 1, dl, MVT::i64))
          : DAG.getConstant(0, dl, VT);
  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
                           HiForNormalShift, CCVal, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
  SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);

  // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
  // is "undef". We wanted 0, so CSEL it directly.
  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
                               ISD::SETEQ, dl, DAG);
  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
  LoBitsForHi =
      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
                  LoBitsForHi, CCVal, Cmp);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i64));
  SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiForNormalShift =
      DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);

  SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
                       dl, DAG);
  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
                           HiForNormalShift, CCVal, Cmp);

  // AArch64 shifts of larger than register sizes are wrapped rather than
  // clamped, so we can't just emit "lo << a" if a is too big.
  SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
  SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
                           LoForNormalShift, CCVal, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  DEBUG(dbgs() << "Skipping offset folding global address: ");
  DEBUG(GA->dump());
  DEBUG(dbgs() << "AArch64 doesn't support folding offsets into global "
                  "addresses\n");
  return false;
}
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 ||
                          (VT == MVT::f16 && Subtarget->hasFullFP16()))) {
    DEBUG(dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
    return true;
  }

  StringRef FPType;
  bool IsLegal = false;
  SmallString<128> ImmStrVal;
  Imm.toString(ImmStrVal);

  if (VT == MVT::f64) {
    FPType = "f64";
    IsLegal = AArch64_AM::getFP64Imm(Imm) != -1;
  } else if (VT == MVT::f32) {
    FPType = "f32";
    IsLegal = AArch64_AM::getFP32Imm(Imm) != -1;
  } else if (VT == MVT::f16 && Subtarget->hasFullFP16()) {
    FPType = "f16";
    IsLegal = AArch64_AM::getFP16Imm(Imm) != -1;
  }

  if (IsLegal) {
    DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal << "\n");
    return true;
  }

  if (!FPType.empty())
    DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal << "\n");
  else
    DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal << ": unsupported fp type\n");

  return false;
}
//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
                           SDValue Operand, SelectionDAG &DAG,
                           int &ExtraSteps) {
  EVT VT = Operand.getValueType();
  if (ST->hasNEON() &&
      (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
       VT == MVT::f32 || VT == MVT::v1f32 ||
       VT == MVT::v2f32 || VT == MVT::v4f32)) {
    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
      // For the reciprocal estimates, convergence is quadratic, so the number
      // of digits is doubled after each iteration.  In ARMv8, the accuracy of
      // the initial estimate is 2^-8.  Thus the number of extra steps to refine
      // the result for float (23 mantissa bits) is 2 and for double (52
      // mantissa bits) is 3.
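      // (Roughly speaking, each extra step takes the 8 correct bits of the
      // initial estimate to 16, then 32, then 64 bits.)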
      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;

    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
  }

  return SDValue();
}
SDValue
AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                       int Enabled, int &ExtraSteps,
                                       bool &UseOneConst,
                                       bool Reciprocal) const {
  if (Enabled == ReciprocalEstimate::Enabled ||
      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
                                       DAG, ExtraSteps)) {
      SDLoc DL(Operand);
      EVT VT = Operand.getValueType();

      SDNodeFlags Flags;
      Flags.setUnsafeAlgebra(true);

      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
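      // That is, FRSQRTS supplies the 0.5 * (3 - M * N) factor, so each step
      // below computes Estimate * FRSQRTS(Operand, Estimate * Estimate),
      // which roughly doubles the number of correct mantissa bits.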
      for (int i = ExtraSteps; i > 0; --i) {
        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
                                   Flags);
        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
      }

      if (!Reciprocal) {
        EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                      VT);
        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
        SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);

        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
        // Correct the result if the operand is 0.0.
        Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
                               VT, Eq, Operand, Estimate);
      }

      ExtraSteps = 0;
      return Estimate;
    }

  return SDValue();
}
SDValue
AArch64TargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                        int Enabled,
                                        int &ExtraSteps) const {
  if (Enabled == ReciprocalEstimate::Enabled)
    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
                                       DAG, ExtraSteps)) {
      SDLoc DL(Operand);
      EVT VT = Operand.getValueType();

      SDNodeFlags Flags;
      Flags.setUnsafeAlgebra(true);

      // Newton reciprocal iteration: E * (2 - X * E)
      // AArch64 reciprocal iteration instruction: (2 - M * N)
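      // That is, FRECPS supplies the (2 - M * N) correction factor, so each
      // step below computes Estimate * FRECPS(Operand, Estimate), again
      // converging quadratically towards 1 / Operand.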
      for (int i = ExtraSteps; i > 0; --i) {
        SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
                                   Estimate, Flags);
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
      }

      ExtraSteps = 0;
      return Estimate;
    }

  return SDValue();
}
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler, not all of them may make sense, e.g. S may be difficult to support.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
//
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
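//
// As an illustrative (GCC-style) example of how a couple of these constraints
// and modifiers might be used from C source:
//
//   int Res;
//   asm("add %w0, %w1, %2" : "=r"(Res) : "r"(Val), "I"(42));
//
// Here "r" selects general registers (printed as w registers because of the
// %w modifier) and "I" requires an immediate that is valid for ADD; the exact
// printing of the immediate operand depends on the compiler front end.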
const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasFPARMv8())
    return "r";

  if (ConstraintVT.isFloatingPoint())
    return "w";

  if (ConstraintVT.isVector() &&
      (ConstraintVT.getSizeInBits() == 64 ||
       ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'z':
      return C_Other;
    case 'x':
    case 'w':
      return C_RegisterClass;
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as 'r'.
    case 'Q':
      return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
AArch64TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'x':
  case 'w':
    if (type->isFloatingPointTy() || type->isVectorTy())
      weight = CW_Register;
    break;
  case 'z':
    weight = CW_Constant;
    break;
  }
  return weight;
}
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'r':
      if (VT.getSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
    case 'w':
      if (VT.getSizeInBits() == 16)
        return std::make_pair(0U, &AArch64::FPR16RegClass);
      if (VT.getSizeInBits() == 32)
        return std::make_pair(0U, &AArch64::FPR32RegClass);
      if (VT.getSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::FPR64RegClass);
      if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::FPR128RegClass);
      break;
    // The instructions that this constraint is designed for can
    // only take 128-bit registers so just use that regclass.
    case 'x':
      if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
      break;
    }
  }
  if (StringRef("{cc}").equals_lower(Constraint))
    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass *> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    unsigned Size = Constraint.size();
    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
      int RegNo;
      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
      if (!Failed && RegNo >= 0 && RegNo <= 31) {
        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
        // By default we'll emit v0-v31 for this unless there's a modifier where
        // we'll emit the correct register as well.
        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR64RegClass;
        } else {
          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR128RegClass;
        }
      }
    }
  }

  return Res;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default:
    break;

  // This set of constraints deal with valid constants for various instructions.
  // Validate and return a target constant for them if we can.
  case 'z': {
    // 'z' maps to xzr or wzr so it needs an input of 0.
    if (!isNullConstant(Op))
      return;

    if (Op.getValueType() == MVT::i64)
      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
    else
      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
    break;
  }

  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      uint64_t NVal = -C->getSExtValue();
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
  EVT VT = V64Reg.getValueType();
  unsigned NarrowSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  SDLoc DL(V64Reg);

  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
                     V64Reg, DAG.getConstant(0, DL, MVT::i32));
}
/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
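/// For example, with i16 elements this returns 2, so an element index is
/// scaled to a byte position before being used as an EXT immediate.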
static unsigned getExtFactor(SDValue &V) {
  EVT EltType = V.getValueType().getVectorElementType();
  return EltType.getSizeInBits() / 8;
}
/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
  EVT VT = V128Reg.getValueType();
  unsigned WideSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
  SDLoc DL(V128Reg);

  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt;
    unsigned MaxElt;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase;
    int WindowScale;

    ShuffleSourceInfo(SDValue Vec)
      : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
        ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
             !isa<ConstantSDNode>(V.getOperand(1))) {
      DEBUG(dbgs() << "Reshuffle failed: "
                      "a shuffle can only come from building a vector from "
                      "various elements of other vectors, provided their "
                      "indices are constant\n");
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  if (Sources.size() > 2) {
    DEBUG(dbgs() << "Reshuffle failed: currently only do something sane when at "
                    "most two source vectors are involved\n");
    return SDValue();
  }

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy)) {
      SmallestEltTy = SrcEltTy;
    }
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      DEBUG(dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i64));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i64));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Imm, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  DEBUG(
    for (auto Src : Sources)
      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
  );

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined =
        std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // Final check before we try to produce nonsense...
  if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
    DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
    return SDValue();
  }

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                         ShuffleOps[1], Mask);
  SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);

  DEBUG(
    dbgs() << "Reshuffle, creating node: ";
    Shuffle.dump();
    dbgs() << "Reshuffle, creating node: ";
    V.dump();
  );

  return V;
}
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
                      unsigned &Imm) {
  // Look for the first non-undef element.
  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

  // Benefit form APInt to handle overflow when calculating expected element.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
  // The following shuffle indices must be the successive elements after the
  // first real element.
  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
  if (FirstWrongElt != M.end())
    return false;

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
  // value of the first element.  E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
  Imm = ExpectedElt.getZExtValue();

  // There are two difference cases requiring to reverse input vectors.
  // For example, for vector <4 x i32> we have the following cases,
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // to reverse two input vectors.
  if (Imm < NumElts)
    ReverseEXT = true;
  else
    Imm -= NumElts;

  return true;
}
/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize.  (The order of the elements
/// within each block of the vector is reversed.)
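/// For example, a REV64 of <8 x i16> corresponds to the shuffle mask
/// <3, 2, 1, 0, 7, 6, 5, 4>: each 64-bit block holds four elements and is
/// reversed in place.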
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for REV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
      return false;
    Idx += 1;
  }

  return true;
}
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != 2 * i + WhichResult)
      return false;
  }

  return true;
}
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
      return false;
  }
  return true;
}
/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
      return false;
    Idx += 1;
  }

  return true;
}
/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned Half = VT.getVectorNumElements() / 2;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned j = 0; j != 2; ++j) {
    unsigned Idx = WhichResult;
    for (unsigned i = 0; i != Half; ++i) {
      int MIdx = M[i + j * Half];
      if (MIdx >= 0 && (unsigned)MIdx != Idx)
        return false;
      Idx += 2;
    }
  }

  return true;
}
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
      return false;
  }
  return true;
}
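// Check whether the shuffle is an insertion of a single lane from one input
// into an otherwise unchanged copy of the other input, which maps onto the
// INS (element) instruction. For example, with four input elements the mask
// <0, 1, 6, 3> keeps the left-hand vector and has one "anomaly" at position
// 2, which LowerVECTOR_SHUFFLE fills from lane 2 of the right-hand vector.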
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
                      bool &DstIsLeft, int &Anomaly) {
  if (M.size() != static_cast<size_t>(NumInputElements))
    return false;

  int NumLHSMatch = 0, NumRHSMatch = 0;
  int LastLHSMismatch = -1, LastRHSMismatch = -1;

  for (int i = 0; i < NumInputElements; ++i) {
    if (M[i] == -1) {
      ++NumLHSMatch;
      ++NumRHSMatch;
      continue;
    }

    if (M[i] == i)
      ++NumLHSMatch;
    else
      LastLHSMismatch = i;

    if (M[i] == i + NumInputElements)
      ++NumRHSMatch;
    else
      LastRHSMismatch = i;
  }

  if (NumLHSMatch == NumInputElements - 1) {
    DstIsLeft = true;
    Anomaly = LastLHSMismatch;
    return true;
  } else if (NumRHSMatch == NumInputElements - 1) {
    DstIsLeft = false;
    Anomaly = LastRHSMismatch;
    return true;
  }

  return false;
}
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
  if (VT.getSizeInBits() != 128)
    return false;

  unsigned NumElts = VT.getVectorNumElements();

  for (int I = 0, E = NumElts / 2; I != E; I++) {
    if (Mask[I] != I)
      return false;
  }

  int Offset = NumElts / 2;
  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
    if (Mask[I] != I + SplitLHS * Offset)
      return false;
  }

  return true;
}
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
    return SDValue();

  bool SplitV0 = V0.getValueSizeInBits() == 128;

  if (!isConcatMask(Mask, VT, SplitV0))
    return SDValue();

  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);
  if (SplitV0) {
    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  if (V1.getValueSizeInBits() == 128) {
    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
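  // For reference, each 32-bit PFEntry packs: the cost in bits [31:30]
  // (extracted later as PFEntry >> 30), the opcode from the enum below in
  // bits [29:26], and the 13-bit perfect-shuffle table indices of the two
  // operand shuffles in bits [25:13] and [12:0].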
  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VREV,
    OP_VDUP0,
    OP_VDUP1,
    OP_VDUP2,
    OP_VDUP3,
    OP_VEXT1,
    OP_VEXT2,
    OP_VEXT3,
    OP_VUZPL, // VUZP, left result
    OP_VUZPR, // VUZP, right result
    OP_VZIPL, // VZIP, left result
    OP_VZIPR, // VZIP, right result
    OP_VTRNL, // VTRN, left result
    OP_VTRNR  // VTRN, right result
  };

  if (OpNum == OP_COPY) {
    if (LHSID == (1 * 9 + 2) * 9 + 3)
      return LHS;
    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default:
    llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> REV32
    if (VT.getVectorElementType() == MVT::i16 ||
        VT.getVectorElementType() == MVT::f16)
      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> REV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3: {
    EVT EltTy = VT.getVectorElementType();
    unsigned Opcode;
    if (EltTy == MVT::i8)
      Opcode = AArch64ISD::DUPLANE8;
    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
      Opcode = AArch64ISD::DUPLANE16;
    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
      Opcode = AArch64ISD::DUPLANE32;
    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
      Opcode = AArch64ISD::DUPLANE64;
    else
      llvm_unreachable("Invalid vector element type?");

    if (VT.getSizeInBits() == 64)
      OpLHS = WidenVector(OpLHS, DAG);
    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
  }
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3: {
    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
                       DAG.getConstant(Imm, dl, MVT::i32));
  }
  case OP_VUZPL:
    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VUZPR:
    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VZIPL:
    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VZIPR:
    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VTRNL:
    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VTRNR:
    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  }
}
static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
                           SelectionDAG &DAG) {
  // Check to see if we can use the TBL instruction.
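  // TBL performs a byte-wise table lookup: each byte of the index vector
  // selects a byte from the concatenated source registers (out-of-range
  // indices yield zero), so the element-level shuffle mask is expanded to
  // byte indices below.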
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  EVT EltVT = Op.getValueType().getVectorElementType();
  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;

  SmallVector<SDValue, 8> TBLMask;
  for (int Val : ShuffleMask) {
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
    }
  }

  MVT IndexVT = MVT::v8i8;
  unsigned IndexLen = 8;
  if (Op.getValueSizeInBits() == 128) {
    IndexVT = MVT::v16i8;
    IndexLen = 16;
  }

  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

  SDValue Shuffle;
  if (V2.getNode()->isUndef()) {
    if (IndexLen == 8)
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
    Shuffle = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
        DAG.getBuildVector(IndexVT, DL,
                           makeArrayRef(TBLMask.data(), IndexLen)));
  } else {
    if (IndexLen == 8) {
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
          DAG.getBuildVector(IndexVT, DL,
                             makeArrayRef(TBLMask.data(), IndexLen)));
    } else {
      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
      // cannot currently represent the register constraints on the input
      // table registers.
      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
      //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
      //                                      IndexLen));
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
          V2Cst, DAG.getBuildVector(IndexVT, DL,
                                    makeArrayRef(TBLMask.data(), IndexLen)));
    }
  }
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}
static unsigned getDUPLANEOp(EVT EltType) {
  if (EltType == MVT::i8)
    return AArch64ISD::DUPLANE8;
  if (EltType == MVT::i16 || EltType == MVT::f16)
    return AArch64ISD::DUPLANE16;
  if (EltType == MVT::i32 || EltType == MVT::f32)
    return AArch64ISD::DUPLANE32;
  if (EltType == MVT::i64 || EltType == MVT::f64)
    return AArch64ISD::DUPLANE64;

  llvm_unreachable("Invalid vector element type?");
}
6073 SDValue
AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op
,
6074 SelectionDAG
&DAG
) const {
6076 EVT VT
= Op
.getValueType();
6078 ShuffleVectorSDNode
*SVN
= cast
<ShuffleVectorSDNode
>(Op
.getNode());
6080 // Convert shuffles that are directly supported on NEON to target-specific
6081 // DAG nodes, instead of keeping them as shuffles and matching them again
6082 // during code selection. This is more efficient and avoids the possibility
6083 // of inconsistencies between legalization and selection.
6084 ArrayRef
<int> ShuffleMask
= SVN
->getMask();
6086 SDValue V1
= Op
.getOperand(0);
6087 SDValue V2
= Op
.getOperand(1);
6089 if (SVN
->isSplat()) {
6090 int Lane
= SVN
->getSplatIndex();
6091 // If this is undef splat, generate it via "just" vdup, if possible.
6095 if (Lane
== 0 && V1
.getOpcode() == ISD::SCALAR_TO_VECTOR
)
6096 return DAG
.getNode(AArch64ISD::DUP
, dl
, V1
.getValueType(),
6098 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
6099 // constant. If so, we can just reference the lane's definition directly.
6100 if (V1
.getOpcode() == ISD::BUILD_VECTOR
&&
6101 !isa
<ConstantSDNode
>(V1
.getOperand(Lane
)))
6102 return DAG
.getNode(AArch64ISD::DUP
, dl
, VT
, V1
.getOperand(Lane
));
6104 // Otherwise, duplicate from the lane of the input vector.
6105 unsigned Opcode
= getDUPLANEOp(V1
.getValueType().getVectorElementType());
6107 // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
6108 // to make a vector of the same size as this SHUFFLE. We can ignore the
6109 // extract entirely, and canonicalise the concat using WidenVector.
6110 if (V1
.getOpcode() == ISD::EXTRACT_SUBVECTOR
) {
6111 Lane
+= cast
<ConstantSDNode
>(V1
.getOperand(1))->getZExtValue();
6112 V1
= V1
.getOperand(0);
6113 } else if (V1
.getOpcode() == ISD::CONCAT_VECTORS
) {
6114 unsigned Idx
= Lane
>= (int)VT
.getVectorNumElements() / 2;
6115 Lane
-= Idx
* VT
.getVectorNumElements() / 2;
6116 V1
= WidenVector(V1
.getOperand(Idx
), DAG
);
6117 } else if (VT
.getSizeInBits() == 64)
6118 V1
= WidenVector(V1
, DAG
);
6120 return DAG
.getNode(Opcode
, dl
, VT
, V1
, DAG
.getConstant(Lane
, dl
, MVT::i64
));
6123 if (isREVMask(ShuffleMask
, VT
, 64))
6124 return DAG
.getNode(AArch64ISD::REV64
, dl
, V1
.getValueType(), V1
, V2
);
6125 if (isREVMask(ShuffleMask
, VT
, 32))
6126 return DAG
.getNode(AArch64ISD::REV32
, dl
, V1
.getValueType(), V1
, V2
);
6127 if (isREVMask(ShuffleMask
, VT
, 16))
6128 return DAG
.getNode(AArch64ISD::REV16
, dl
, V1
.getValueType(), V1
, V2
);
6130 bool ReverseEXT
= false;
6132 if (isEXTMask(ShuffleMask
, VT
, ReverseEXT
, Imm
)) {
6135 Imm
*= getExtFactor(V1
);
6136 return DAG
.getNode(AArch64ISD::EXT
, dl
, V1
.getValueType(), V1
, V2
,
6137 DAG
.getConstant(Imm
, dl
, MVT::i32
));
6138 } else if (V2
->isUndef() && isSingletonEXTMask(ShuffleMask
, VT
, Imm
)) {
6139 Imm
*= getExtFactor(V1
);
6140 return DAG
.getNode(AArch64ISD::EXT
, dl
, V1
.getValueType(), V1
, V1
,
6141 DAG
.getConstant(Imm
, dl
, MVT::i32
));
6144 unsigned WhichResult
;
6145 if (isZIPMask(ShuffleMask
, VT
, WhichResult
)) {
6146 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::ZIP1
: AArch64ISD::ZIP2
;
6147 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V2
);
6149 if (isUZPMask(ShuffleMask
, VT
, WhichResult
)) {
6150 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::UZP1
: AArch64ISD::UZP2
;
6151 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V2
);
6153 if (isTRNMask(ShuffleMask
, VT
, WhichResult
)) {
6154 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::TRN1
: AArch64ISD::TRN2
;
6155 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V2
);
6158 if (isZIP_v_undef_Mask(ShuffleMask
, VT
, WhichResult
)) {
6159 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::ZIP1
: AArch64ISD::ZIP2
;
6160 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V1
);
6162 if (isUZP_v_undef_Mask(ShuffleMask
, VT
, WhichResult
)) {
6163 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::UZP1
: AArch64ISD::UZP2
;
6164 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V1
);
6166 if (isTRN_v_undef_Mask(ShuffleMask
, VT
, WhichResult
)) {
6167 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::TRN1
: AArch64ISD::TRN2
;
6168 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V1
);
  if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
    return Concat;

  bool DstIsLeft;
  int Anomaly;
  int NumInputElements = V1.getValueType().getVectorNumElements();
  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
    SDValue DstVec = DstIsLeft ? V1 : V2;
    SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);

    SDValue SrcVec = V1;
    int SrcLane = ShuffleMask[Anomaly];
    if (SrcLane >= NumInputElements) {
      SrcVec = V2;
      SrcLane -= VT.getVectorNumElements();
    }
    SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);

    EVT ScalarVT = VT.getVectorElementType();

    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
      ScalarVT = MVT::i32;

    return DAG.getNode(
        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
        DstLaneV);
  }

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                            PFIndexes[2] * 9 + PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  return GenerateTBL(Op, ShuffleMask, DAG);
}
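// Worked example (added for illustration, not part of the original code):
// for the 4-element mask <0, 2, 4, 6>, the perfect-shuffle indexes are
// {0, 2, 4, 6} (8 stands for an undef lane), giving
//   PFTableIndex = 0*729 + 2*81 + 4*9 + 6 = 204,
// and the top two bits of PerfectShuffleTable[204] encode the cost used to
// decide whether the table entry beats the generic TBL lowering.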
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                               APInt &UndefBits) {
  EVT VT = BVN->getValueType(0);
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;

    for (unsigned i = 0; i < NumSplats; ++i) {
      CnstBits <<= SplatBitSize;
      UndefBits <<= SplatBitSize;
      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
    }

    return true;
  }

  return false;
}
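// Illustrative note (added): for a v4i32 splat of the constant 0xFF,
// isConstantSplat reports SplatBits = 0xFF with SplatBitSize = 32, so the
// loop above shifts and ORs four times, leaving CnstBits as the 128-bit
// pattern 0x000000FF_000000FF_000000FF_000000FF; UndefBits is accumulated
// the same way from the lanes that were undef.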
6247 SDValue
AArch64TargetLowering::LowerVectorAND(SDValue Op
,
6248 SelectionDAG
&DAG
) const {
6249 BuildVectorSDNode
*BVN
=
6250 dyn_cast
<BuildVectorSDNode
>(Op
.getOperand(1).getNode());
6251 SDValue LHS
= Op
.getOperand(0);
6253 EVT VT
= Op
.getValueType();
6258 APInt
CnstBits(VT
.getSizeInBits(), 0);
6259 APInt
UndefBits(VT
.getSizeInBits(), 0);
6260 if (resolveBuildVector(BVN
, CnstBits
, UndefBits
)) {
6261 // We only have BIC vector immediate instruction, which is and-not.
6262 CnstBits
= ~CnstBits
;
6264 // We make use of a little bit of goto ickiness in order to avoid having to
6265 // duplicate the immediate matching logic for the undef toggled case.
6266 bool SecondTry
= false;
6269 if (CnstBits
.getHiBits(64) == CnstBits
.getLoBits(64)) {
6270 CnstBits
= CnstBits
.zextOrTrunc(64);
6271 uint64_t CnstVal
= CnstBits
.getZExtValue();
6273 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal
)) {
6274 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType1(CnstVal
);
6275 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6276 SDValue Mov
= DAG
.getNode(AArch64ISD::BICi
, dl
, MovTy
, LHS
,
6277 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6278 DAG
.getConstant(0, dl
, MVT::i32
));
6279 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6282 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal
)) {
6283 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType2(CnstVal
);
6284 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6285 SDValue Mov
= DAG
.getNode(AArch64ISD::BICi
, dl
, MovTy
, LHS
,
6286 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6287 DAG
.getConstant(8, dl
, MVT::i32
));
6288 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6291 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal
)) {
6292 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType3(CnstVal
);
6293 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6294 SDValue Mov
= DAG
.getNode(AArch64ISD::BICi
, dl
, MovTy
, LHS
,
6295 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6296 DAG
.getConstant(16, dl
, MVT::i32
));
6297 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6300 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal
)) {
6301 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType4(CnstVal
);
6302 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6303 SDValue Mov
= DAG
.getNode(AArch64ISD::BICi
, dl
, MovTy
, LHS
,
6304 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6305 DAG
.getConstant(24, dl
, MVT::i32
));
6306 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6309 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal
)) {
6310 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType5(CnstVal
);
6311 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6312 SDValue Mov
= DAG
.getNode(AArch64ISD::BICi
, dl
, MovTy
, LHS
,
6313 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6314 DAG
.getConstant(0, dl
, MVT::i32
));
6315 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6318 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal
)) {
6319 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType6(CnstVal
);
6320 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6321 SDValue Mov
= DAG
.getNode(AArch64ISD::BICi
, dl
, MovTy
, LHS
,
6322 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6323 DAG
.getConstant(8, dl
, MVT::i32
));
6324 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6331 CnstBits
= ~UndefBits
;
6335 // We can always fall back to a non-immediate AND.
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal.
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
                                     uint64_t &ConstVal) {
  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
  if (!Bvec)
    return false;
  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
  if (!FirstElt)
    return false;
  EVT VT = Bvec->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 1; i < NumElts; ++i)
    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
      return false;
  ConstVal = FirstElt->getZExtValue();
  return true;
}

static unsigned getIntrinsicID(const SDNode *N) {
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default:
    return Intrinsic::not_intrinsic;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    if (IID < Intrinsic::num_intrinsics)
      return IID;
    return Intrinsic::not_intrinsic;
  }
  }
}
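// Illustrative note (added): for an ISD::INTRINSIC_WO_CHAIN node whose first
// operand is getConstant(Intrinsic::aarch64_neon_umaxv), the helper above
// returns Intrinsic::aarch64_neon_umaxv; any other node kind maps to
// Intrinsic::not_intrinsic.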
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
// Also, logical shift right -> sri, with the same structure.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  if (!VT.isVector())
    return SDValue();

  SDLoc DL(N);

  // Is the first op an AND?
  const SDValue And = N->getOperand(0);
  if (And.getOpcode() != ISD::AND)
    return SDValue();

  // Is the second op an shl or lshr?
  SDValue Shift = N->getOperand(1);
  // This will have been turned into: AArch64ISD::VSHL vector, #shift
  // or AArch64ISD::VLSHR vector, #shift
  unsigned ShiftOpc = Shift.getOpcode();
  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
    return SDValue();
  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;

  // Is the shift amount constant?
  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!C2node)
    return SDValue();

  // Is the and mask vector all constant?
  uint64_t C1;
  if (!isAllConstantBuildVector(And.getOperand(1), C1))
    return SDValue();

  // Is C1 == ~C2, taking into account how much one can shift elements of a
  // particular size?
  uint64_t C2 = C2node->getZExtValue();
  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
  if (C2 > ElemSizeInBits)
    return SDValue();
  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
  if ((C1 & ElemMask) != (~C2 & ElemMask))
    return SDValue();

  SDValue X = And.getOperand(0);
  SDValue Y = Shift.getOperand(0);

  unsigned Intrin =
      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
  SDValue ResultSLI =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                  DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
                  Shift.getOperand(1));

  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
  DEBUG(N->dump(&DAG));
  DEBUG(dbgs() << "into: \n");
  DEBUG(ResultSLI->dump(&DAG));

  ++NumShiftInserts;
  return ResultSLI;
}
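// Worked example (added for illustration): with v8i8 operands and a shift
// amount C2 == 3, the AND mask constant C1 must equal ~3 masked to the
// element width, i.e. 0xF8, for (or (and X, C1), (shl Y, 3)) to be rewritten
// as an SLI of Y into X by 3; the lshr form produces an SRI the same way.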
6439 SDValue
AArch64TargetLowering::LowerVectorOR(SDValue Op
,
6440 SelectionDAG
&DAG
) const {
6441 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
6442 if (EnableAArch64SlrGeneration
) {
6443 if (SDValue Res
= tryLowerToSLI(Op
.getNode(), DAG
))
6447 BuildVectorSDNode
*BVN
=
6448 dyn_cast
<BuildVectorSDNode
>(Op
.getOperand(0).getNode());
6449 SDValue LHS
= Op
.getOperand(1);
6451 EVT VT
= Op
.getValueType();
6453 // OR commutes, so try swapping the operands.
6455 LHS
= Op
.getOperand(0);
6456 BVN
= dyn_cast
<BuildVectorSDNode
>(Op
.getOperand(1).getNode());
6461 APInt
CnstBits(VT
.getSizeInBits(), 0);
6462 APInt
UndefBits(VT
.getSizeInBits(), 0);
6463 if (resolveBuildVector(BVN
, CnstBits
, UndefBits
)) {
6464 // We make use of a little bit of goto ickiness in order to avoid having to
6465 // duplicate the immediate matching logic for the undef toggled case.
6466 bool SecondTry
= false;
6469 if (CnstBits
.getHiBits(64) == CnstBits
.getLoBits(64)) {
6470 CnstBits
= CnstBits
.zextOrTrunc(64);
6471 uint64_t CnstVal
= CnstBits
.getZExtValue();
6473 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal
)) {
6474 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType1(CnstVal
);
6475 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6476 SDValue Mov
= DAG
.getNode(AArch64ISD::ORRi
, dl
, MovTy
, LHS
,
6477 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6478 DAG
.getConstant(0, dl
, MVT::i32
));
6479 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6482 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal
)) {
6483 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType2(CnstVal
);
6484 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6485 SDValue Mov
= DAG
.getNode(AArch64ISD::ORRi
, dl
, MovTy
, LHS
,
6486 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6487 DAG
.getConstant(8, dl
, MVT::i32
));
6488 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6491 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal
)) {
6492 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType3(CnstVal
);
6493 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6494 SDValue Mov
= DAG
.getNode(AArch64ISD::ORRi
, dl
, MovTy
, LHS
,
6495 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6496 DAG
.getConstant(16, dl
, MVT::i32
));
6497 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6500 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal
)) {
6501 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType4(CnstVal
);
6502 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6503 SDValue Mov
= DAG
.getNode(AArch64ISD::ORRi
, dl
, MovTy
, LHS
,
6504 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6505 DAG
.getConstant(24, dl
, MVT::i32
));
6506 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6509 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal
)) {
6510 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType5(CnstVal
);
6511 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6512 SDValue Mov
= DAG
.getNode(AArch64ISD::ORRi
, dl
, MovTy
, LHS
,
6513 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6514 DAG
.getConstant(0, dl
, MVT::i32
));
6515 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6518 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal
)) {
6519 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType6(CnstVal
);
6520 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6521 SDValue Mov
= DAG
.getNode(AArch64ISD::ORRi
, dl
, MovTy
, LHS
,
6522 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6523 DAG
.getConstant(8, dl
, MVT::i32
));
6524 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6531 CnstBits
= UndefBits
;
6535 // We can always fall back to a non-immediate OR.
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
static SDValue NormalizeBuildVector(SDValue Op,
                                    SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltTy = VT.getVectorElementType();

  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
    return Op;

  SmallVector<SDValue, 16> Ops;
  for (SDValue Lane : Op->ops()) {
    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
      APInt LowBits(EltTy.getSizeInBits(),
                    CstLane->getZExtValue());
      Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
    }
    Ops.push_back(Lane);
  }
  return DAG.getBuildVector(VT, dl, Ops);
}
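// Illustrative note (added): for a v8i8 BUILD_VECTOR with a lane whose
// ConstantSDNode value is 0x1FF, the APInt constructed above has only the
// 8-bit element width, so the lane is re-emitted as the i32 constant 0xFF;
// non-constant lanes are passed through unchanged.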
6564 SDValue
AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op
,
6565 SelectionDAG
&DAG
) const {
6567 EVT VT
= Op
.getValueType();
6568 Op
= NormalizeBuildVector(Op
, DAG
);
6569 BuildVectorSDNode
*BVN
= cast
<BuildVectorSDNode
>(Op
.getNode());
6571 APInt
CnstBits(VT
.getSizeInBits(), 0);
6572 APInt
UndefBits(VT
.getSizeInBits(), 0);
6573 if (resolveBuildVector(BVN
, CnstBits
, UndefBits
)) {
6574 // We make use of a little bit of goto ickiness in order to avoid having to
6575 // duplicate the immediate matching logic for the undef toggled case.
6576 bool SecondTry
= false;
6579 if (CnstBits
.getHiBits(64) == CnstBits
.getLoBits(64)) {
6580 CnstBits
= CnstBits
.zextOrTrunc(64);
6581 uint64_t CnstVal
= CnstBits
.getZExtValue();
6583 // Certain magic vector constants (used to express things like NOT
6584 // and NEG) are passed through unmodified. This allows codegen patterns
6585 // for these operations to match. Special-purpose patterns will lower
6586 // these immediates to MOVIs if it proves necessary.
6587 if (VT
.isInteger() && (CnstVal
== 0 || CnstVal
== ~0ULL))
6590 // The many faces of MOVI...
6591 if (AArch64_AM::isAdvSIMDModImmType10(CnstVal
)) {
6592 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType10(CnstVal
);
6593 if (VT
.getSizeInBits() == 128) {
6594 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIedit
, dl
, MVT::v2i64
,
6595 DAG
.getConstant(CnstVal
, dl
, MVT::i32
));
6596 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6599 // Support the V64 version via subregister insertion.
6600 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIedit
, dl
, MVT::f64
,
6601 DAG
.getConstant(CnstVal
, dl
, MVT::i32
));
6602 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6605 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal
)) {
6606 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType1(CnstVal
);
6607 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6608 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIshift
, dl
, MovTy
,
6609 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6610 DAG
.getConstant(0, dl
, MVT::i32
));
6611 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6614 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal
)) {
6615 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType2(CnstVal
);
6616 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6617 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIshift
, dl
, MovTy
,
6618 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6619 DAG
.getConstant(8, dl
, MVT::i32
));
6620 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6623 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal
)) {
6624 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType3(CnstVal
);
6625 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6626 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIshift
, dl
, MovTy
,
6627 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6628 DAG
.getConstant(16, dl
, MVT::i32
));
6629 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6632 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal
)) {
6633 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType4(CnstVal
);
6634 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6635 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIshift
, dl
, MovTy
,
6636 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6637 DAG
.getConstant(24, dl
, MVT::i32
));
6638 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6641 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal
)) {
6642 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType5(CnstVal
);
6643 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6644 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIshift
, dl
, MovTy
,
6645 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6646 DAG
.getConstant(0, dl
, MVT::i32
));
6647 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6650 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal
)) {
6651 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType6(CnstVal
);
6652 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6653 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVIshift
, dl
, MovTy
,
6654 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6655 DAG
.getConstant(8, dl
, MVT::i32
));
6656 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6659 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal
)) {
6660 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType7(CnstVal
);
6661 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6662 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVImsl
, dl
, MovTy
,
6663 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6664 DAG
.getConstant(264, dl
, MVT::i32
));
6665 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6668 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal
)) {
6669 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType8(CnstVal
);
6670 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6671 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVImsl
, dl
, MovTy
,
6672 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6673 DAG
.getConstant(272, dl
, MVT::i32
));
6674 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6677 if (AArch64_AM::isAdvSIMDModImmType9(CnstVal
)) {
6678 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType9(CnstVal
);
6679 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v16i8
: MVT::v8i8
;
6680 SDValue Mov
= DAG
.getNode(AArch64ISD::MOVI
, dl
, MovTy
,
6681 DAG
.getConstant(CnstVal
, dl
, MVT::i32
));
6682 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6685 // The few faces of FMOV...
6686 if (AArch64_AM::isAdvSIMDModImmType11(CnstVal
)) {
6687 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType11(CnstVal
);
6688 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4f32
: MVT::v2f32
;
6689 SDValue Mov
= DAG
.getNode(AArch64ISD::FMOV
, dl
, MovTy
,
6690 DAG
.getConstant(CnstVal
, dl
, MVT::i32
));
6691 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6694 if (AArch64_AM::isAdvSIMDModImmType12(CnstVal
) &&
6695 VT
.getSizeInBits() == 128) {
6696 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType12(CnstVal
);
6697 SDValue Mov
= DAG
.getNode(AArch64ISD::FMOV
, dl
, MVT::v2f64
,
6698 DAG
.getConstant(CnstVal
, dl
, MVT::i32
));
6699 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6702 // The many faces of MVNI...
6704 if (AArch64_AM::isAdvSIMDModImmType1(CnstVal
)) {
6705 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType1(CnstVal
);
6706 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6707 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNIshift
, dl
, MovTy
,
6708 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6709 DAG
.getConstant(0, dl
, MVT::i32
));
6710 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6713 if (AArch64_AM::isAdvSIMDModImmType2(CnstVal
)) {
6714 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType2(CnstVal
);
6715 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6716 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNIshift
, dl
, MovTy
,
6717 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6718 DAG
.getConstant(8, dl
, MVT::i32
));
6719 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6722 if (AArch64_AM::isAdvSIMDModImmType3(CnstVal
)) {
6723 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType3(CnstVal
);
6724 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6725 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNIshift
, dl
, MovTy
,
6726 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6727 DAG
.getConstant(16, dl
, MVT::i32
));
6728 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6731 if (AArch64_AM::isAdvSIMDModImmType4(CnstVal
)) {
6732 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType4(CnstVal
);
6733 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6734 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNIshift
, dl
, MovTy
,
6735 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6736 DAG
.getConstant(24, dl
, MVT::i32
));
6737 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6740 if (AArch64_AM::isAdvSIMDModImmType5(CnstVal
)) {
6741 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType5(CnstVal
);
6742 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6743 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNIshift
, dl
, MovTy
,
6744 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6745 DAG
.getConstant(0, dl
, MVT::i32
));
6746 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6749 if (AArch64_AM::isAdvSIMDModImmType6(CnstVal
)) {
6750 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType6(CnstVal
);
6751 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
6752 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNIshift
, dl
, MovTy
,
6753 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6754 DAG
.getConstant(8, dl
, MVT::i32
));
6755 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6758 if (AArch64_AM::isAdvSIMDModImmType7(CnstVal
)) {
6759 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType7(CnstVal
);
6760 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6761 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNImsl
, dl
, MovTy
,
6762 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6763 DAG
.getConstant(264, dl
, MVT::i32
));
6764 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6767 if (AArch64_AM::isAdvSIMDModImmType8(CnstVal
)) {
6768 CnstVal
= AArch64_AM::encodeAdvSIMDModImmType8(CnstVal
);
6769 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
6770 SDValue Mov
= DAG
.getNode(AArch64ISD::MVNImsl
, dl
, MovTy
,
6771 DAG
.getConstant(CnstVal
, dl
, MVT::i32
),
6772 DAG
.getConstant(272, dl
, MVT::i32
));
6773 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
6780 CnstBits
= UndefBits
;
6785 // Scan through the operands to find some interesting properties we can
6787 // 1) If only one value is used, we can use a DUP, or
6788 // 2) if only the low element is not undef, we can just insert that, or
6789 // 3) if only one constant value is used (w/ some non-constant lanes),
6790 // we can splat the constant value into the whole vector then fill
6791 // in the non-constant lanes.
6792 // 4) FIXME: If different constant values are used, but we can intelligently
6793 // select the values we'll be overwriting for the non-constant
6794 // lanes such that we can directly materialize the vector
6795 // some other way (MOVI, e.g.), we can be sneaky.
6796 unsigned NumElts
= VT
.getVectorNumElements();
6797 bool isOnlyLowElement
= true;
6798 bool usesOnlyOneValue
= true;
6799 bool usesOnlyOneConstantValue
= true;
6800 bool isConstant
= true;
6801 unsigned NumConstantLanes
= 0;
6803 SDValue ConstantValue
;
6804 for (unsigned i
= 0; i
< NumElts
; ++i
) {
6805 SDValue V
= Op
.getOperand(i
);
6809 isOnlyLowElement
= false;
6810 if (!isa
<ConstantFPSDNode
>(V
) && !isa
<ConstantSDNode
>(V
))
6813 if (isa
<ConstantSDNode
>(V
) || isa
<ConstantFPSDNode
>(V
)) {
6815 if (!ConstantValue
.getNode())
6817 else if (ConstantValue
!= V
)
6818 usesOnlyOneConstantValue
= false;
6821 if (!Value
.getNode())
6823 else if (V
!= Value
)
6824 usesOnlyOneValue
= false;
6827 if (!Value
.getNode()) {
6828 DEBUG(dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
6829 return DAG
.getUNDEF(VT
);
6832 if (isOnlyLowElement
) {
6833 DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
6834 "SCALAR_TO_VECTOR node\n");
6835 return DAG
.getNode(ISD::SCALAR_TO_VECTOR
, dl
, VT
, Value
);
6838 // Use DUP for non-constant splats. For f32 constant splats, reduce to
6839 // i32 and try again.
6840 if (usesOnlyOneValue
) {
6842 if (Value
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
6843 Value
.getValueType() != VT
) {
6844 DEBUG(dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
6845 return DAG
.getNode(AArch64ISD::DUP
, dl
, VT
, Value
);
6848 // This is actually a DUPLANExx operation, which keeps everything vectory.
6850 SDValue Lane
= Value
.getOperand(1);
6851 Value
= Value
.getOperand(0);
6852 if (Value
.getValueSizeInBits() == 64) {
6853 DEBUG(dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
6855 Value
= WidenVector(Value
, DAG
);
6858 unsigned Opcode
= getDUPLANEOp(VT
.getVectorElementType());
6859 return DAG
.getNode(Opcode
, dl
, VT
, Value
, Lane
);
6862 if (VT
.getVectorElementType().isFloatingPoint()) {
6863 SmallVector
<SDValue
, 8> Ops
;
6864 EVT EltTy
= VT
.getVectorElementType();
6865 assert ((EltTy
== MVT::f16
|| EltTy
== MVT::f32
|| EltTy
== MVT::f64
) &&
6866 "Unsupported floating-point vector type");
6867 DEBUG(dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
6868 "BITCASTS, and try again\n");
6869 MVT NewType
= MVT::getIntegerVT(EltTy
.getSizeInBits());
6870 for (unsigned i
= 0; i
< NumElts
; ++i
)
6871 Ops
.push_back(DAG
.getNode(ISD::BITCAST
, dl
, NewType
, Op
.getOperand(i
)));
6872 EVT VecVT
= EVT::getVectorVT(*DAG
.getContext(), NewType
, NumElts
);
6873 SDValue Val
= DAG
.getBuildVector(VecVT
, dl
, Ops
);
6875 dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
6878 Val
= LowerBUILD_VECTOR(Val
, DAG
);
6880 return DAG
.getNode(ISD::BITCAST
, dl
, VT
, Val
);
6884 // If there was only one constant value used and for more than one lane,
6885 // start by splatting that value, then replace the non-constant lanes. This
6886 // is better than the default, which will perform a separate initialization
6888 if (NumConstantLanes
> 0 && usesOnlyOneConstantValue
) {
6889 SDValue Val
= DAG
.getNode(AArch64ISD::DUP
, dl
, VT
, ConstantValue
);
6890 // Now insert the non-constant lanes.
6891 for (unsigned i
= 0; i
< NumElts
; ++i
) {
6892 SDValue V
= Op
.getOperand(i
);
6893 SDValue LaneIdx
= DAG
.getConstant(i
, dl
, MVT::i64
);
6894 if (!isa
<ConstantSDNode
>(V
) && !isa
<ConstantFPSDNode
>(V
)) {
6895 // Note that type legalization likely mucked about with the VT of the
6896 // source operand, so we may have to convert it here before inserting.
6897 Val
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, VT
, Val
, V
, LaneIdx
);
6903 // This will generate a load from the constant pool.
6905 DEBUG(dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
6910 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
6912 if (SDValue shuffle
= ReconstructShuffle(Op
, DAG
))
6916 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
6917 // know the default expansion would otherwise fall back on something even
6918 // worse. For a vector with one or two non-undef values, that's
6919 // scalar_to_vector for the elements followed by a shuffle (provided the
6920 // shuffle is valid for the target) and materialization element by element
6921 // on the stack followed by a load for everything else.
6922 if (!isConstant
&& !usesOnlyOneValue
) {
6923 DEBUG(dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
6924 "of INSERT_VECTOR_ELT\n");
6926 SDValue Vec
= DAG
.getUNDEF(VT
);
6927 SDValue Op0
= Op
.getOperand(0);
6930 // Use SCALAR_TO_VECTOR for lane zero to
6931 // a) Avoid a RMW dependency on the full vector register, and
6932 // b) Allow the register coalescer to fold away the copy if the
6933 // value is already in an S or D register, and we're forced to emit an
6934 // INSERT_SUBREG that we can't fold anywhere.
6936 // We also allow types like i8 and i16 which are illegal scalar but legal
6937 // vector element types. After type-legalization the inserted value is
6938 // extended (i32) and it is safe to cast them to the vector type by ignoring
6939 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
6940 if (!Op0
.isUndef()) {
6941 DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
6942 Vec
= DAG
.getNode(ISD::SCALAR_TO_VECTOR
, dl
, VT
, Op0
);
6947 dbgs() << "Creating nodes for the other vector elements:\n";
6949 for (; i
< NumElts
; ++i
) {
6950 SDValue V
= Op
.getOperand(i
);
6953 SDValue LaneIdx
= DAG
.getConstant(i
, dl
, MVT::i64
);
6954 Vec
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, VT
, Vec
, V
, LaneIdx
);
6959 DEBUG(dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
6960 "better alternative\n");
6964 SDValue
AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op
,
6965 SelectionDAG
&DAG
) const {
6966 assert(Op
.getOpcode() == ISD::INSERT_VECTOR_ELT
&& "Unknown opcode!");
6968 // Check for non-constant or out of range lane.
6969 EVT VT
= Op
.getOperand(0).getValueType();
6970 ConstantSDNode
*CI
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(2));
6971 if (!CI
|| CI
->getZExtValue() >= VT
.getVectorNumElements())
6975 // Insertion/extraction are legal for V128 types.
6976 if (VT
== MVT::v16i8
|| VT
== MVT::v8i16
|| VT
== MVT::v4i32
||
6977 VT
== MVT::v2i64
|| VT
== MVT::v4f32
|| VT
== MVT::v2f64
||
6981 if (VT
!= MVT::v8i8
&& VT
!= MVT::v4i16
&& VT
!= MVT::v2i32
&&
6982 VT
!= MVT::v1i64
&& VT
!= MVT::v2f32
&& VT
!= MVT::v4f16
)
6985 // For V64 types, we perform insertion by expanding the value
6986 // to a V128 type and perform the insertion on that.
6988 SDValue WideVec
= WidenVector(Op
.getOperand(0), DAG
);
6989 EVT WideTy
= WideVec
.getValueType();
6991 SDValue Node
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, DL
, WideTy
, WideVec
,
6992 Op
.getOperand(1), Op
.getOperand(2));
6993 // Re-narrow the resultant vector.
6994 return NarrowVector(Node
, DAG
);
6998 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op
,
6999 SelectionDAG
&DAG
) const {
7000 assert(Op
.getOpcode() == ISD::EXTRACT_VECTOR_ELT
&& "Unknown opcode!");
7002 // Check for non-constant or out of range lane.
7003 EVT VT
= Op
.getOperand(0).getValueType();
7004 ConstantSDNode
*CI
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(1));
7005 if (!CI
|| CI
->getZExtValue() >= VT
.getVectorNumElements())
7009 // Insertion/extraction are legal for V128 types.
7010 if (VT
== MVT::v16i8
|| VT
== MVT::v8i16
|| VT
== MVT::v4i32
||
7011 VT
== MVT::v2i64
|| VT
== MVT::v4f32
|| VT
== MVT::v2f64
||
7015 if (VT
!= MVT::v8i8
&& VT
!= MVT::v4i16
&& VT
!= MVT::v2i32
&&
7016 VT
!= MVT::v1i64
&& VT
!= MVT::v2f32
&& VT
!= MVT::v4f16
)
7019 // For V64 types, we perform extraction by expanding the value
7020 // to a V128 type and perform the extraction on that.
7022 SDValue WideVec
= WidenVector(Op
.getOperand(0), DAG
);
7023 EVT WideTy
= WideVec
.getValueType();
7025 EVT ExtrTy
= WideTy
.getVectorElementType();
7026 if (ExtrTy
== MVT::i16
|| ExtrTy
== MVT::i8
)
7029 // For extractions, we just return the result directly.
7030 return DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, ExtrTy
, WideVec
,
7034 SDValue
AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op
,
7035 SelectionDAG
&DAG
) const {
7036 EVT VT
= Op
.getOperand(0).getValueType();
7042 ConstantSDNode
*Cst
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(1));
7045 unsigned Val
= Cst
->getZExtValue();
7047 unsigned Size
= Op
.getValueSizeInBits();
7049 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
7053 // If this is extracting the upper 64-bits of a 128-bit vector, we match
7055 if (Size
== 64 && Val
* VT
.getScalarSizeInBits() == 64)
7061 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef
<int> M
, EVT VT
) const {
7062 if (VT
.getVectorNumElements() == 4 &&
7063 (VT
.is128BitVector() || VT
.is64BitVector())) {
7064 unsigned PFIndexes
[4];
7065 for (unsigned i
= 0; i
!= 4; ++i
) {
7069 PFIndexes
[i
] = M
[i
];
7072 // Compute the index in the perfect shuffle table.
7073 unsigned PFTableIndex
= PFIndexes
[0] * 9 * 9 * 9 + PFIndexes
[1] * 9 * 9 +
7074 PFIndexes
[2] * 9 + PFIndexes
[3];
7075 unsigned PFEntry
= PerfectShuffleTable
[PFTableIndex
];
7076 unsigned Cost
= (PFEntry
>> 30);
7084 unsigned DummyUnsigned
;
7086 return (ShuffleVectorSDNode::isSplatMask(&M
[0], VT
) || isREVMask(M
, VT
, 64) ||
7087 isREVMask(M
, VT
, 32) || isREVMask(M
, VT
, 16) ||
7088 isEXTMask(M
, VT
, DummyBool
, DummyUnsigned
) ||
7089 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
7090 isTRNMask(M
, VT
, DummyUnsigned
) || isUZPMask(M
, VT
, DummyUnsigned
) ||
7091 isZIPMask(M
, VT
, DummyUnsigned
) ||
7092 isTRN_v_undef_Mask(M
, VT
, DummyUnsigned
) ||
7093 isUZP_v_undef_Mask(M
, VT
, DummyUnsigned
) ||
7094 isZIP_v_undef_Mask(M
, VT
, DummyUnsigned
) ||
7095 isINSMask(M
, VT
.getVectorNumElements(), DummyBool
, DummyInt
) ||
7096 isConcatMask(M
, VT
, VT
.getSizeInBits() == 128));
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
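// Illustrative note (added): for v4i32 shifts ElementBits is 32, so
// isVShiftLImm accepts 0..31 (or 0..32 when isLong) and isVShiftRImm
// accepts 1..32 (or 1..16 when isNarrow).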
7141 SDValue
AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op
,
7142 SelectionDAG
&DAG
) const {
7143 EVT VT
= Op
.getValueType();
7147 if (!Op
.getOperand(1).getValueType().isVector())
7149 unsigned EltSize
= VT
.getScalarSizeInBits();
7151 switch (Op
.getOpcode()) {
7153 llvm_unreachable("unexpected shift opcode");
7156 if (isVShiftLImm(Op
.getOperand(1), VT
, false, Cnt
) && Cnt
< EltSize
)
7157 return DAG
.getNode(AArch64ISD::VSHL
, DL
, VT
, Op
.getOperand(0),
7158 DAG
.getConstant(Cnt
, DL
, MVT::i32
));
7159 return DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, DL
, VT
,
7160 DAG
.getConstant(Intrinsic::aarch64_neon_ushl
, DL
,
7162 Op
.getOperand(0), Op
.getOperand(1));
7165 // Right shift immediate
7166 if (isVShiftRImm(Op
.getOperand(1), VT
, false, Cnt
) && Cnt
< EltSize
) {
7168 (Op
.getOpcode() == ISD::SRA
) ? AArch64ISD::VASHR
: AArch64ISD::VLSHR
;
7169 return DAG
.getNode(Opc
, DL
, VT
, Op
.getOperand(0),
7170 DAG
.getConstant(Cnt
, DL
, MVT::i32
));
7173 // Right shift register. Note, there is not a shift right register
7174 // instruction, but the shift left register instruction takes a signed
7175 // value, where negative numbers specify a right shift.
7176 unsigned Opc
= (Op
.getOpcode() == ISD::SRA
) ? Intrinsic::aarch64_neon_sshl
7177 : Intrinsic::aarch64_neon_ushl
;
7178 // negate the shift amount
7179 SDValue NegShift
= DAG
.getNode(AArch64ISD::NEG
, DL
, VT
, Op
.getOperand(1));
7180 SDValue NegShiftLeft
=
7181 DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, DL
, VT
,
7182 DAG
.getConstant(Opc
, DL
, MVT::i32
), Op
.getOperand(0),
7184 return NegShiftLeft
;
7190 static SDValue
EmitVectorComparison(SDValue LHS
, SDValue RHS
,
7191 AArch64CC::CondCode CC
, bool NoNans
, EVT VT
,
7192 const SDLoc
&dl
, SelectionDAG
&DAG
) {
7193 EVT SrcVT
= LHS
.getValueType();
7194 assert(VT
.getSizeInBits() == SrcVT
.getSizeInBits() &&
7195 "function only supposed to emit natural comparisons");
7197 BuildVectorSDNode
*BVN
= dyn_cast
<BuildVectorSDNode
>(RHS
.getNode());
7198 APInt
CnstBits(VT
.getSizeInBits(), 0);
7199 APInt
UndefBits(VT
.getSizeInBits(), 0);
7200 bool IsCnst
= BVN
&& resolveBuildVector(BVN
, CnstBits
, UndefBits
);
7201 bool IsZero
= IsCnst
&& (CnstBits
== 0);
7203 if (SrcVT
.getVectorElementType().isFloatingPoint()) {
7207 case AArch64CC::NE
: {
7210 Fcmeq
= DAG
.getNode(AArch64ISD::FCMEQz
, dl
, VT
, LHS
);
7212 Fcmeq
= DAG
.getNode(AArch64ISD::FCMEQ
, dl
, VT
, LHS
, RHS
);
7213 return DAG
.getNode(AArch64ISD::NOT
, dl
, VT
, Fcmeq
);
7217 return DAG
.getNode(AArch64ISD::FCMEQz
, dl
, VT
, LHS
);
7218 return DAG
.getNode(AArch64ISD::FCMEQ
, dl
, VT
, LHS
, RHS
);
7221 return DAG
.getNode(AArch64ISD::FCMGEz
, dl
, VT
, LHS
);
7222 return DAG
.getNode(AArch64ISD::FCMGE
, dl
, VT
, LHS
, RHS
);
7225 return DAG
.getNode(AArch64ISD::FCMGTz
, dl
, VT
, LHS
);
7226 return DAG
.getNode(AArch64ISD::FCMGT
, dl
, VT
, LHS
, RHS
);
7229 return DAG
.getNode(AArch64ISD::FCMLEz
, dl
, VT
, LHS
);
7230 return DAG
.getNode(AArch64ISD::FCMGE
, dl
, VT
, RHS
, LHS
);
7234 // If we ignore NaNs then we can use to the MI implementation.
7238 return DAG
.getNode(AArch64ISD::FCMLTz
, dl
, VT
, LHS
);
7239 return DAG
.getNode(AArch64ISD::FCMGT
, dl
, VT
, RHS
, LHS
);
7246 case AArch64CC::NE
: {
7249 Cmeq
= DAG
.getNode(AArch64ISD::CMEQz
, dl
, VT
, LHS
);
7251 Cmeq
= DAG
.getNode(AArch64ISD::CMEQ
, dl
, VT
, LHS
, RHS
);
7252 return DAG
.getNode(AArch64ISD::NOT
, dl
, VT
, Cmeq
);
7256 return DAG
.getNode(AArch64ISD::CMEQz
, dl
, VT
, LHS
);
7257 return DAG
.getNode(AArch64ISD::CMEQ
, dl
, VT
, LHS
, RHS
);
7260 return DAG
.getNode(AArch64ISD::CMGEz
, dl
, VT
, LHS
);
7261 return DAG
.getNode(AArch64ISD::CMGE
, dl
, VT
, LHS
, RHS
);
7264 return DAG
.getNode(AArch64ISD::CMGTz
, dl
, VT
, LHS
);
7265 return DAG
.getNode(AArch64ISD::CMGT
, dl
, VT
, LHS
, RHS
);
7268 return DAG
.getNode(AArch64ISD::CMLEz
, dl
, VT
, LHS
);
7269 return DAG
.getNode(AArch64ISD::CMGE
, dl
, VT
, RHS
, LHS
);
7271 return DAG
.getNode(AArch64ISD::CMHS
, dl
, VT
, RHS
, LHS
);
7273 return DAG
.getNode(AArch64ISD::CMHI
, dl
, VT
, RHS
, LHS
);
7276 return DAG
.getNode(AArch64ISD::CMLTz
, dl
, VT
, LHS
);
7277 return DAG
.getNode(AArch64ISD::CMGT
, dl
, VT
, RHS
, LHS
);
7279 return DAG
.getNode(AArch64ISD::CMHI
, dl
, VT
, LHS
, RHS
);
7281 return DAG
.getNode(AArch64ISD::CMHS
, dl
, VT
, LHS
, RHS
);
7285 SDValue
AArch64TargetLowering::LowerVSETCC(SDValue Op
,
7286 SelectionDAG
&DAG
) const {
7287 ISD::CondCode CC
= cast
<CondCodeSDNode
>(Op
.getOperand(2))->get();
7288 SDValue LHS
= Op
.getOperand(0);
7289 SDValue RHS
= Op
.getOperand(1);
7290 EVT CmpVT
= LHS
.getValueType().changeVectorElementTypeToInteger();
7293 if (LHS
.getValueType().getVectorElementType().isInteger()) {
7294 assert(LHS
.getValueType() == RHS
.getValueType());
7295 AArch64CC::CondCode AArch64CC
= changeIntCCToAArch64CC(CC
);
7297 EmitVectorComparison(LHS
, RHS
, AArch64CC
, false, CmpVT
, dl
, DAG
);
7298 return DAG
.getSExtOrTrunc(Cmp
, dl
, Op
.getValueType());
7301 const bool FullFP16
=
7302 static_cast<const AArch64Subtarget
&>(DAG
.getSubtarget()).hasFullFP16();
7304 // Make v4f16 (only) fcmp operations utilise vector instructions
7305 // v8f16 support will be a litle more complicated
7306 if (LHS
.getValueType().getVectorElementType() == MVT::f16
) {
7307 if (!FullFP16
&& LHS
.getValueType().getVectorNumElements() == 4) {
7308 LHS
= DAG
.getNode(ISD::FP_EXTEND
, dl
, MVT::v4f32
, LHS
);
7309 RHS
= DAG
.getNode(ISD::FP_EXTEND
, dl
, MVT::v4f32
, RHS
);
7310 SDValue NewSetcc
= DAG
.getSetCC(dl
, MVT::v4i16
, LHS
, RHS
, CC
);
7311 DAG
.ReplaceAllUsesWith(Op
, NewSetcc
);
7317 assert(LHS
.getValueType().getVectorElementType() == MVT::f32
||
7318 LHS
.getValueType().getVectorElementType() == MVT::f64
);
7320 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
7321 // clean. Some of them require two branches to implement.
7322 AArch64CC::CondCode CC1
, CC2
;
7324 changeVectorFPCCToAArch64CC(CC
, CC1
, CC2
, ShouldInvert
);
7326 bool NoNaNs
= getTargetMachine().Options
.NoNaNsFPMath
;
7328 EmitVectorComparison(LHS
, RHS
, CC1
, NoNaNs
, CmpVT
, dl
, DAG
);
7332 if (CC2
!= AArch64CC::AL
) {
7334 EmitVectorComparison(LHS
, RHS
, CC2
, NoNaNs
, CmpVT
, dl
, DAG
);
7335 if (!Cmp2
.getNode())
7338 Cmp
= DAG
.getNode(ISD::OR
, dl
, CmpVT
, Cmp
, Cmp2
);
7341 Cmp
= DAG
.getSExtOrTrunc(Cmp
, dl
, Op
.getValueType());
7344 return Cmp
= DAG
.getNOT(dl
, Cmp
, Cmp
.getValueType());
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
                                  SelectionDAG &DAG) {
  SDValue VecOp = ScalarOp.getOperand(0);
  auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
                     DAG.getConstant(0, DL, MVT::i64));
}
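// Illustrative note (added): for a VECREDUCE_ADD of a v4i32 operand this
// helper builds an AArch64ISD::UADDV node typed as v4i32 and then extracts
// lane 0, which is where the across-vector result is expected to live.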
7357 SDValue
AArch64TargetLowering::LowerVECREDUCE(SDValue Op
,
7358 SelectionDAG
&DAG
) const {
7360 switch (Op
.getOpcode()) {
7361 case ISD::VECREDUCE_ADD
:
7362 return getReductionSDNode(AArch64ISD::UADDV
, dl
, Op
, DAG
);
7363 case ISD::VECREDUCE_SMAX
:
7364 return getReductionSDNode(AArch64ISD::SMAXV
, dl
, Op
, DAG
);
7365 case ISD::VECREDUCE_SMIN
:
7366 return getReductionSDNode(AArch64ISD::SMINV
, dl
, Op
, DAG
);
7367 case ISD::VECREDUCE_UMAX
:
7368 return getReductionSDNode(AArch64ISD::UMAXV
, dl
, Op
, DAG
);
7369 case ISD::VECREDUCE_UMIN
:
7370 return getReductionSDNode(AArch64ISD::UMINV
, dl
, Op
, DAG
);
7371 case ISD::VECREDUCE_FMAX
: {
7372 assert(Op
->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
7374 ISD::INTRINSIC_WO_CHAIN
, dl
, Op
.getValueType(),
7375 DAG
.getConstant(Intrinsic::aarch64_neon_fmaxnmv
, dl
, MVT::i32
),
7378 case ISD::VECREDUCE_FMIN
: {
7379 assert(Op
->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
7381 ISD::INTRINSIC_WO_CHAIN
, dl
, Op
.getValueType(),
7382 DAG
.getConstant(Intrinsic::aarch64_neon_fminnmv
, dl
, MVT::i32
),
7386 llvm_unreachable("Unhandled reduction");
7390 SDValue
AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op
,
7391 SelectionDAG
&DAG
) const {
7392 auto &Subtarget
= static_cast<const AArch64Subtarget
&>(DAG
.getSubtarget());
7393 if (!Subtarget
.hasLSE())
7396 // LSE has an atomic load-add instruction, but not a load-sub.
7398 MVT VT
= Op
.getSimpleValueType();
7399 SDValue RHS
= Op
.getOperand(2);
7400 AtomicSDNode
*AN
= cast
<AtomicSDNode
>(Op
.getNode());
7401 RHS
= DAG
.getNode(ISD::SUB
, dl
, VT
, DAG
.getConstant(0, dl
, VT
), RHS
);
7402 return DAG
.getAtomic(ISD::ATOMIC_LOAD_ADD
, dl
, AN
->getMemoryVT(),
7403 Op
.getOperand(0), Op
.getOperand(1), RHS
,
7404 AN
->getMemOperand());
7407 SDValue
AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op
,
7408 SelectionDAG
&DAG
) const {
7409 auto &Subtarget
= static_cast<const AArch64Subtarget
&>(DAG
.getSubtarget());
7410 if (!Subtarget
.hasLSE())
7413 // LSE has an atomic load-clear instruction, but not a load-and.
7415 MVT VT
= Op
.getSimpleValueType();
7416 SDValue RHS
= Op
.getOperand(2);
7417 AtomicSDNode
*AN
= cast
<AtomicSDNode
>(Op
.getNode());
7418 RHS
= DAG
.getNode(ISD::XOR
, dl
, VT
, DAG
.getConstant(-1ULL, dl
, VT
), RHS
);
7419 return DAG
.getAtomic(ISD::ATOMIC_LOAD_CLR
, dl
, AN
->getMemoryVT(),
7420 Op
.getOperand(0), Op
.getOperand(1), RHS
,
7421 AN
->getMemOperand());
7424 SDValue
AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
7425 SDValue Op
, SDValue Chain
, SDValue
&Size
, SelectionDAG
&DAG
) const {
7427 EVT PtrVT
= getPointerTy(DAG
.getDataLayout());
7428 SDValue Callee
= DAG
.getTargetExternalSymbol("__chkstk", PtrVT
, 0);
7430 const uint32_t *Mask
=
7431 Subtarget
->getRegisterInfo()->getWindowsStackProbePreservedMask();
7433 Size
= DAG
.getNode(ISD::SRL
, dl
, MVT::i64
, Size
,
7434 DAG
.getConstant(4, dl
, MVT::i64
));
7435 Chain
= DAG
.getCopyToReg(Chain
, dl
, AArch64::X15
, Size
, SDValue());
7437 DAG
.getNode(AArch64ISD::CALL
, dl
, DAG
.getVTList(MVT::Other
, MVT::Glue
),
7438 Chain
, Callee
, DAG
.getRegister(AArch64::X15
, MVT::i64
),
7439 DAG
.getRegisterMask(Mask
), Chain
.getValue(1));
7440 // To match the actual intent better, we should read the output from X15 here
7441 // again (instead of potentially spilling it to the stack), but rereading Size
7442 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
7445 Size
= DAG
.getNode(ISD::SHL
, dl
, MVT::i64
, Size
,
7446 DAG
.getConstant(4, dl
, MVT::i64
));
7451 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op
,
7452 SelectionDAG
&DAG
) const {
7453 assert(Subtarget
->isTargetWindows() &&
7454 "Only Windows alloca probing supported");
7457 SDNode
*Node
= Op
.getNode();
7458 SDValue Chain
= Op
.getOperand(0);
7459 SDValue Size
= Op
.getOperand(1);
7460 unsigned Align
= cast
<ConstantSDNode
>(Op
.getOperand(2))->getZExtValue();
7461 EVT VT
= Node
->getValueType(0);
7463 Chain
= DAG
.getCALLSEQ_START(Chain
, 0, 0, dl
);
7465 Chain
= LowerWindowsDYNAMIC_STACKALLOC(Op
, Chain
, Size
, DAG
);
7467 SDValue SP
= DAG
.getCopyFromReg(Chain
, dl
, AArch64::SP
, MVT::i64
);
7468 Chain
= SP
.getValue(1);
7469 SP
= DAG
.getNode(ISD::SUB
, dl
, MVT::i64
, SP
, Size
);
7470 Chain
= DAG
.getCopyToReg(Chain
, dl
, AArch64::SP
, SP
);
7473 SP
= DAG
.getNode(ISD::AND
, dl
, VT
, SP
.getValue(0),
7474 DAG
.getConstant(-(uint64_t)Align
, dl
, VT
));
7475 Chain
= DAG
.getCopyToReg(Chain
, dl
, AArch64::SP
, SP
);
7478 Chain
= DAG
.getCALLSEQ_END(Chain
, DAG
.getIntPtrConstant(0, dl
, true),
7479 DAG
.getIntPtrConstant(0, dl
, true), SDValue(), dl
);
7481 SDValue Ops
[2] = {SP
, Chain
};
7482 return DAG
.getMergeValues(Ops
, dl
);
7485 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
7486 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
7487 /// specified in the intrinsic calls.
7488 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo
&Info
,
7490 MachineFunction
&MF
,
7491 unsigned Intrinsic
) const {
7492 auto &DL
= I
.getModule()->getDataLayout();
7493 switch (Intrinsic
) {
7494 case Intrinsic::aarch64_neon_ld2
:
7495 case Intrinsic::aarch64_neon_ld3
:
7496 case Intrinsic::aarch64_neon_ld4
:
7497 case Intrinsic::aarch64_neon_ld1x2
:
7498 case Intrinsic::aarch64_neon_ld1x3
:
7499 case Intrinsic::aarch64_neon_ld1x4
:
7500 case Intrinsic::aarch64_neon_ld2lane
:
7501 case Intrinsic::aarch64_neon_ld3lane
:
7502 case Intrinsic::aarch64_neon_ld4lane
:
7503 case Intrinsic::aarch64_neon_ld2r
:
7504 case Intrinsic::aarch64_neon_ld3r
:
7505 case Intrinsic::aarch64_neon_ld4r
: {
7506 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
7507 // Conservatively set memVT to the entire set of vectors loaded.
7508 uint64_t NumElts
= DL
.getTypeSizeInBits(I
.getType()) / 64;
7509 Info
.memVT
= EVT::getVectorVT(I
.getType()->getContext(), MVT::i64
, NumElts
);
7510 Info
.ptrVal
= I
.getArgOperand(I
.getNumArgOperands() - 1);
7513 // volatile loads with NEON intrinsics not supported
7514 Info
.flags
= MachineMemOperand::MOLoad
;
7517 case Intrinsic::aarch64_neon_st2
:
7518 case Intrinsic::aarch64_neon_st3
:
7519 case Intrinsic::aarch64_neon_st4
:
7520 case Intrinsic::aarch64_neon_st1x2
:
7521 case Intrinsic::aarch64_neon_st1x3
:
7522 case Intrinsic::aarch64_neon_st1x4
:
7523 case Intrinsic::aarch64_neon_st2lane
:
7524 case Intrinsic::aarch64_neon_st3lane
:
7525 case Intrinsic::aarch64_neon_st4lane
: {
7526 Info
.opc
= ISD::INTRINSIC_VOID
;
7527 // Conservatively set memVT to the entire set of vectors stored.
7528 unsigned NumElts
= 0;
7529 for (unsigned ArgI
= 1, ArgE
= I
.getNumArgOperands(); ArgI
< ArgE
; ++ArgI
) {
7530 Type
*ArgTy
= I
.getArgOperand(ArgI
)->getType();
7531 if (!ArgTy
->isVectorTy())
7533 NumElts
+= DL
.getTypeSizeInBits(ArgTy
) / 64;
7535 Info
.memVT
= EVT::getVectorVT(I
.getType()->getContext(), MVT::i64
, NumElts
);
7536 Info
.ptrVal
= I
.getArgOperand(I
.getNumArgOperands() - 1);
7539 // volatile stores with NEON intrinsics not supported
7540 Info
.flags
= MachineMemOperand::MOStore
;
7543 case Intrinsic::aarch64_ldaxr
:
7544 case Intrinsic::aarch64_ldxr
: {
7545 PointerType
*PtrTy
= cast
<PointerType
>(I
.getArgOperand(0)->getType());
7546 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
7547 Info
.memVT
= MVT::getVT(PtrTy
->getElementType());
7548 Info
.ptrVal
= I
.getArgOperand(0);
7550 Info
.align
= DL
.getABITypeAlignment(PtrTy
->getElementType());
7551 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOVolatile
;
7554 case Intrinsic::aarch64_stlxr
:
7555 case Intrinsic::aarch64_stxr
: {
7556 PointerType
*PtrTy
= cast
<PointerType
>(I
.getArgOperand(1)->getType());
7557 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
7558 Info
.memVT
= MVT::getVT(PtrTy
->getElementType());
7559 Info
.ptrVal
= I
.getArgOperand(1);
7561 Info
.align
= DL
.getABITypeAlignment(PtrTy
->getElementType());
7562 Info
.flags
= MachineMemOperand::MOStore
| MachineMemOperand::MOVolatile
;
7565 case Intrinsic::aarch64_ldaxp
:
7566 case Intrinsic::aarch64_ldxp
:
7567 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
7568 Info
.memVT
= MVT::i128
;
7569 Info
.ptrVal
= I
.getArgOperand(0);
7572 Info
.flags
= MachineMemOperand::MOLoad
| MachineMemOperand::MOVolatile
;
7574 case Intrinsic::aarch64_stlxp
:
7575 case Intrinsic::aarch64_stxp
:
7576 Info
.opc
= ISD::INTRINSIC_W_CHAIN
;
7577 Info
.memVT
= MVT::i128
;
7578 Info
.ptrVal
= I
.getArgOperand(2);
7581 Info
.flags
= MachineMemOperand::MOStore
| MachineMemOperand::MOVolatile
;
// Truncations from 64-bit GPR to 32-bit GPR are free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}

bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}
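// Illustrative note (added): by the predicates above, truncating i64 to i32
// (or any wider integer to a narrower one) is reported as free, since the
// 32-bit view of a 64-bit GPR is just its low half.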
/// Check if it is profitable to hoist instruction in then/else to if.
/// Not profitable if I and its user can form a FMA instruction
/// because we prefer FMSUB/FMADD.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
  if (I->getOpcode() != Instruction::FMul)
    return true;

  if (!I->hasOneUse())
    return true;

  Instruction *User = I->user_back();

  if (User &&
      !(User->getOpcode() == Instruction::FSub ||
        User->getOpcode() == Instruction::FAdd))
    return true;

  const TargetOptions &Options = getTargetMachine().Options;
  const DataLayout &DL = I->getModule()->getDataLayout();
  EVT VT = getValueType(DL, User->getOperand(0)->getType());

  return !(isFMAFasterThanFMulAndFAdd(VT) &&
           isOperationLegalOrCustom(ISD::FMA, VT) &&
           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
            Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
          VT1.getSizeInBits() <= 32);
}
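// Illustrative note (added): zero-extending an i32 load to i64 is reported
// as free here; the assumption is that 8/16/32-bit loads already leave a
// zero-extended value in the 64-bit register, as the comment above states.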
7665 bool AArch64TargetLowering::isExtFreeImpl(const Instruction
*Ext
) const {
7666 if (isa
<FPExtInst
>(Ext
))
7669 // Vector types are not free.
7670 if (Ext
->getType()->isVectorTy())
7673 for (const Use
&U
: Ext
->uses()) {
7674 // The extension is free if we can fold it with a left shift in an
7675 // addressing mode or an arithmetic operation: add, sub, and cmp.
7677 // Is there a shift?
7678 const Instruction
*Instr
= cast
<Instruction
>(U
.getUser());
7680 // Is this a constant shift?
7681 switch (Instr
->getOpcode()) {
7682 case Instruction::Shl
:
7683 if (!isa
<ConstantInt
>(Instr
->getOperand(1)))
7686 case Instruction::GetElementPtr
: {
7687 gep_type_iterator GTI
= gep_type_begin(Instr
);
7688 auto &DL
= Ext
->getModule()->getDataLayout();
7689 std::advance(GTI
, U
.getOperandNo()-1);
7690 Type
*IdxTy
= GTI
.getIndexedType();
7691 // This extension will end up with a shift because of the scaling factor.
7692 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
7693 // Get the shift amount based on the scaling factor:
7694 // log2(sizeof(IdxTy)) - log2(8).
7696 countTrailingZeros(DL
.getTypeStoreSizeInBits(IdxTy
)) - 3;
7697 // Is the constant foldable in the shift of the addressing mode?
7698 // I.e., shift amount is between 1 and 4 inclusive.
7699 if (ShiftAmt
== 0 || ShiftAmt
> 4)
7703 case Instruction::Trunc
:
7704 // Check if this is a noop.
7705 // trunc(sext ty1 to ty2) to ty1.
7706 if (Instr
->getType() == Ext
->getOperand(0)->getType())
7713 // At this point we can use the bfm family, so this extension is free
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                          unsigned &RequiredAligment) const {
  if (!LoadedType.isSimple() ||
      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
    return false;
  // Cyclone supports unaligned accesses.
  RequiredAligment = 0;
  unsigned NumBits = LoadedType.getSizeInBits();
  return NumBits == 32 || NumBits == 64;
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                                 const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

MachineMemOperand::Flags
AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
      I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
    return MOStridedAccess;
  return MachineMemOperand::MONone;
}

bool AArch64TargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  return VecSize == 64 || VecSize % 128 == 0;
}
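// Worked example (added for illustration): a v4i32 access is 128 bits, so
// getNumInterleavedAccesses returns (128 + 127) / 128 = 1, while a v16i32
// access (512 bits) needs 4; a 16-bit vector such as v2i8 is rejected by
// isLegalInterleavedAccessType because it is neither 64 bits nor a multiple
// of 128.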
/// \brief Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  const DataLayout &DL = LI->getModule()->getDataLayout();

  VectorType *VecTy = Shuffles[0]->getType();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  Type *EltTy = VecTy->getVectorElementType();
  if (EltTy->isPointerTy())
    VecTy = VectorType::get(DL.getIntPtrType(EltTy),
                            VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
  Type *Tys[2] = {VecTy, PtrTy};
  static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
                                            Intrinsic::aarch64_neon_ld3,
                                            Intrinsic::aarch64_neon_ld4};
  Function *LdNFunc =
      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {

    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(
          BaseAddr, VecTy->getVectorNumElements() * Factor);

    CallInst *LdN = Builder.CreateCall(
        LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");

    // Extract and store the sub-vectors returned by the load intrinsic.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SVI = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(LdN, Index);

      // Convert the integer vector to pointer vector if the element is
      // pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));
      SubVecs[SVI].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
/// \brief Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                  ShuffleVectorInst *SVI,
                                                  unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);
    unsigned NumOpElts =
        dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();

    // Convert to the corresponding integer vector.
    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  auto Mask = SVI->getShuffleMask();

  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
  Type *Tys[2] = {SubVecTy, PtrTy};
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
                                             Intrinsic::aarch64_neon_st3,
                                             Intrinsic::aarch64_neon_st4};
  Function *StNFunc =
      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {

    SmallVector<Value *, 5> Ops;

    // Split the shufflevector operands into sub vectors for the new stN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0.
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask.
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);

    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
    Builder.CreateCall(StNFunc, Ops);
  }
  return true;
}
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}
EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                               unsigned SrcAlign, bool IsMemset,
                                               bool ZeroMemset,
                                               bool MemcpyStrSrc,
                                               MachineFunction &MF) const {
  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
  // instruction to materialize the v2i64 zero and one store (with restrictive
  // addressing mode). Just do two i64 store of zero-registers.
  bool Fast;
  const Function &F = MF.getFunction();
  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
      !F.hasFnAttribute(Attribute::NoImplicitFloat) &&
      (memOpAlign(SrcAlign, DstAlign, 16) ||
       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
    return MVT::f128;

  if (Size >= 8 &&
      (memOpAlign(SrcAlign, DstAlign, 8) ||
       (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
    return MVT::i64;

  if (Size >= 4 &&
      (memOpAlign(SrcAlign, DstAlign, 4) ||
       (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
    return MVT::i32;

  return MVT::Other;
}
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  if (Immed == std::numeric_limits<int64_t>::min()) {
    DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n");
    return false;
  }
  // Same encoding for add/sub, just flip the sign.
  Immed = std::abs(Immed);
  bool IsLegal = ((Immed >> 12) == 0 ||
                  ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
  DEBUG(dbgs() << "Is " << Immed << " legal add imm: " <<
        (IsLegal ? "yes" : "no") << "\n");
  return IsLegal;
}
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  return isLegalAddImmediate(Immed);
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                  const AddrMode &AM, Type *Ty,
                                                  unsigned AS,
                                                  Instruction *I) const {
  // AArch64 has five basic addressing modes:
  //  reg
  //  reg + 9-bit signed offset
  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //  reg1 + reg2
  //  reg + SIZE_IN_BYTES * reg

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // No reg+reg+imm addressing.
  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
    return false;

  // check reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
  uint64_t NumBytes = 0;
  if (Ty->isSized()) {
    uint64_t NumBits = DL.getTypeSizeInBits(Ty);
    NumBytes = NumBits / 8;
    if (!isPowerOf2_64(NumBits))
      NumBytes = 0;
  }

  if (!AM.Scale) {
    int64_t Offset = AM.BaseOffs;

    // 9-bit signed offset
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
    unsigned shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> shift) << shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2

  return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
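// Illustrative examples for an i64 access (NumBytes == 8): offset -256 is
// accepted through the signed 9-bit form, offset 32760 (4095 * 8) through the
// scaled unsigned 12-bit form, while offset 260 is rejected because it is
// neither within [-256, 255] nor a multiple of 8. With a scaled index
// register, AM.Scale must be 1 or equal to NumBytes (here 8).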
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints.
  static const MCPhysReg ScratchRegs[] = {
    AArch64::X16, AArch64::X17, AArch64::LR, 0
  };
  return ScratchRegs;
}
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
  EVT VT = N->getValueType(0);
  // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
  // it with shift to let it be lowered to UBFX.
  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
      isa<ConstantSDNode>(N->getOperand(1))) {
    uint64_t TruncMask = N->getConstantOperandVal(1);
    if (isMask_64(TruncMask) &&
        N->getOperand(0).getOpcode() == ISD::SRL &&
        isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
      return false;
  }
  return true;
}
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                              Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return false;

  int64_t Val = Imm.getSExtValue();
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
    return true;

  if ((int64_t)Val < 0)
    Val = ~Val;
  if (BitSize == 32)
    Val &= (1LL << 32) - 1;

  unsigned LZ = countLeadingZeros((uint64_t)Val);
  unsigned Shift = (63 - LZ) / 16;
  // MOVZ is free so return true for one or fewer MOVK.
  return Shift < 3;
}
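// Illustrative example: for Val = 0x0000123456789abc the highest set bit is
// below bit 48, so LZ >= 16, Shift = (63 - LZ) / 16 <= 2, and the immediate
// is treated as cheap (at most a MOVZ plus two MOVKs). A 64-bit value with
// bits set in the top 16 bits gives Shift == 3, so the constant-pool load is
// kept instead.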
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   cmge X, X, #0
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!Subtarget->hasNEON() || !VT.isVector())
    return SDValue();

  // There must be a shift right algebraic before the xor, and the xor must be a
  // 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
    return SDValue();

  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
  // and change it to SUB and CSEL.
  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                  N0.getOperand(0));
        // Generate SUBS & CSEL.
        SDValue Cmp =
            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
                        N0.getOperand(0), DAG.getConstant(0, DL, VT));
        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
                           DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
                           SDValue(Cmp.getNode(), 1));
      }
  return SDValue();
}
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  return performIntegerAbsCombine(N, DAG);
}
SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                     SelectionDAG &DAG,
                                     std::vector<SDNode *> *Created) const {
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isIntDivCheap(N->getValueType(0), Attr))
    return SDValue(N, 0); // Lower SDIV as SDIV

  // fold (sdiv X, pow2)
  EVT VT = N->getValueType(0);
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  unsigned Lg2 = Divisor.countTrailingZeros();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);

  // Add (N0 < 0) ? Pow2 - 1 : 0;
  SDValue CCVal;
  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);

  if (Created) {
    Created->push_back(Cmp.getNode());
    Created->push_back(Add.getNode());
    Created->push_back(CSel.getNode());
  }

  SDValue SRA =
      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));

  // If we're dividing by a positive value, we're done. Otherwise, we must
  // negate the result.
  if (Divisor.isNonNegative())
    return SRA;

  if (Created)
    Created->push_back(SRA.getNode());
  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
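// Illustrative example: a 32-bit signed division by 8 produced by the code
// above selects to roughly
//   add  w8, w0, #7          // N0 + (Pow2 - 1)
//   cmp  w0, #0              // SUBS against zero
//   csel w8, w8, w0, lt      // take the biased value only when N0 < 0
//   asr  w0, w8, #3          // shift by Lg2
// with an additional negation when the divisor is negative.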
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // The below optimizations require a constant RHS.
  if (!isa<ConstantSDNode>(N->getOperand(1)))
    return SDValue();

  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
  const APInt &ConstValue = C->getAPIntValue();

  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  // More aggressively, some multiplications N0 * C can be lowered to
  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  // e.g. 6=3*2=(2+1)*2.
  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
  // which equals to (1+2)*16-(1+2).
  SDValue N0 = N->getOperand(0);
  // TrailingZeroes is used to test if the mul can be lowered to
  // shift+add+shift.
  unsigned TrailingZeroes = ConstValue.countTrailingZeros();
  if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smul or umul.
    if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
                            isZeroExtended(N0.getNode(), DAG)))
      return SDValue();
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into madd or msub.
    if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
                           N->use_begin()->getOpcode() == ISD::SUB))
      return SDValue();
  }
  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  // and shift+add+shift.
  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

  unsigned ShiftAmt, AddSubOpc;
  // Is the shifted value the LHS operand of the add/sub?
  bool ShiftValUseIsN0 = true;
  // Do we need to negate the result?
  bool NegateResult = false;

  if (ConstValue.isNonNegative()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    APInt SCVMinus1 = ShiftedConstValue - 1;
    APInt CVPlus1 = ConstValue + 1;
    if (SCVMinus1.isPowerOf2()) {
      ShiftAmt = SCVMinus1.logBase2();
      AddSubOpc = ISD::ADD;
    } else if (CVPlus1.isPowerOf2()) {
      ShiftAmt = CVPlus1.logBase2();
      AddSubOpc = ISD::SUB;
    } else
      return SDValue();
  } else {
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
    APInt CVNegPlus1 = -ConstValue + 1;
    APInt CVNegMinus1 = -ConstValue - 1;
    if (CVNegPlus1.isPowerOf2()) {
      ShiftAmt = CVNegPlus1.logBase2();
      AddSubOpc = ISD::SUB;
      ShiftValUseIsN0 = false;
    } else if (CVNegMinus1.isPowerOf2()) {
      ShiftAmt = CVNegMinus1.logBase2();
      AddSubOpc = ISD::ADD;
      NegateResult = true;
    } else
      return SDValue();
  }

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
                                   DAG.getConstant(ShiftAmt, DL, MVT::i64));

  SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
  SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
  SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
  assert(!(NegateResult && TrailingZeroes) &&
         "NegateResult and TrailingZeroes cannot both be true for now.");
  // Negate the result.
  if (NegateResult)
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
  // Shift the result.
  if (TrailingZeroes)
    return DAG.getNode(ISD::SHL, DL, VT, Res,
                       DAG.getConstant(TrailingZeroes, DL, MVT::i64));
  return Res;
}
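// Illustrative examples of the decompositions above:
//   C = 3  -> (add (shl x, 1), x)
//   C = 6  -> (shl (add (shl x, 1), x), 1)    // 6 = 3 * 2
//   C = -7 -> (sub x, (shl x, 3))             // -7 = -(2^3 - 1)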
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                         SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
    return Res;
  }

  return SDValue();
}
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                     const AArch64Subtarget *Subtarget) {
  // First try to optimize away the conversion when it's conditionally from
  // a constant. Vectors only.
  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
    return Res;

  EVT VT = N->getValueType(0);
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Only optimize when the source and destination types have the same width.
  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
    return SDValue();

  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
  SDValue N0 = N->getOperand(0);
  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) &&
      N0.hasOneUse() &&
      // Do not change the width of a volatile load.
      !cast<LoadSDNode>(N0)->isVolatile()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                               LN0->getPointerInfo(), LN0->getAlignment(),
                               LN0->getMemOperand()->getFlags());

    // Make sure successors of the original load stay after it by updating them
    // to use the new Chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));

    unsigned Opcode =
        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
                                            : AArch64ISD::UITOF;
    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
  }

  return SDValue();
}
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64)
    return SDValue();

  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
  if (IntBits > FloatBits)
    return SDValue();

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t Bits = IntBits == 64 ? 64 : 32;
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
  if (C == -1 || C == 0 || C > Bits)
    return SDValue();

  MVT ResTy;
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  switch (NumLanes) {
  default:
    return SDValue();
  case 2:
    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
    break;
  case 4:
    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
    break;
  }

  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
    return SDValue();

  assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
         "Illegal vector type after legalization");

  SDLoc DL(N);
  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
  SDValue FixConv =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
                  Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
  // We can handle smaller integers by generating an extra trunc.
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);

  return FixConv;
}
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned Opc = Op->getOpcode();
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      !Op.getOperand(0).getValueType().isSimple() ||
      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  int32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  int32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
  if (IntBits > FloatBits)
    return SDValue();

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
  if (C == -1 || C == 0 || C > FloatBits)
    return SDValue();

  MVT ResTy;
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  switch (NumLanes) {
  default:
    return SDValue();
  case 2:
    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
    break;
  case 4:
    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
    break;
  }

  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc DL(N);
  SDValue ConvInput = Op.getOperand(0);
  bool IsSigned = Opc == ISD::SINT_TO_FP;
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                            ResTy, ConvInput);

  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
                     DAG.getConstant(C, DL, MVT::i32));
}
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                         bool &FromHi) {
  if (N.getOpcode() == ISD::SHL)
    FromHi = false;
  else if (N.getOpcode() == ISD::SRL)
    FromHi = true;
  else
    return false;

  if (!isa<ConstantSDNode>(N.getOperand(1)))
    return false;

  ShiftAmount = N->getConstantOperandVal(1);
  Src = N->getOperand(0);
  return true;
}
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
/// with an EXTR. Can't quite be done in TableGen because the two immediates
/// aren't independent.
static SDValue tryCombineToEXTR(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  assert(N->getOpcode() == ISD::OR && "Unexpected root");

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDValue LHS;
  uint32_t ShiftLHS = 0;
  bool LHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
    return SDValue();

  SDValue RHS;
  uint32_t ShiftRHS = 0;
  bool RHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
    return SDValue();

  // If they're both trying to come from the high part of the register, they're
  // not really an EXTR.
  if (LHSFromHi == RHSFromHi)
    return SDValue();

  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
    return SDValue();

  if (LHSFromHi) {
    std::swap(LHS, RHS);
    std::swap(ShiftLHS, ShiftRHS);
  }

  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
                     DAG.getConstant(ShiftRHS, DL, MVT::i64));
}
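// Illustrative example: for i32, (or (shl x, 24), (srl y, 8)) satisfies
// 24 + 8 == 32 and becomes (EXTR x, y, 8), which corresponds roughly to
// "extr wD, wX, wY, #8": the low 8 bits of x become the new high bits and
// the high 24 bits of y become the new low bits.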
static SDValue tryCombineToBSL(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() != ISD::AND)
    return SDValue();

  // We only have to look for constant vectors here since the general, variable
  // case can be handled in TableGen.
  unsigned Bits = VT.getScalarSizeInBits();
  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
  for (int i = 1; i >= 0; --i)
    for (int j = 1; j >= 0; --j) {
      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
      if (!BVN0 || !BVN1)
        continue;

      bool FoundMatch = true;
      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
        if (!CN0 || !CN1 ||
            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
          FoundMatch = false;
          break;
        }
      }

      if (FoundMatch)
        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
                           N0->getOperand(1 - i), N1->getOperand(1 - j));
    }

  return SDValue();
}
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {
  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (SDValue Res = tryCombineToEXTR(N, DCI))
    return Res;

  if (SDValue Res = tryCombineToBSL(N, DCI))
    return Res;

  return SDValue();
}
static SDValue performSRLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
  // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
  // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() == ISD::BSWAP) {
    SDLoc DL(N);
    SDValue N1 = N->getOperand(1);
    SDValue N00 = N0.getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      uint64_t ShiftAmt = C->getZExtValue();
      if (VT == MVT::i32 && ShiftAmt == 16 &&
          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
      if (VT == MVT::i64 && ShiftAmt == 32 &&
          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
    }
  }
  return SDValue();
}
static SDValue performBitcastCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
  // Wait 'til after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Remove extraneous bitcasts around an extract_subvector.
  // For example,
  //    (v4i16 (bitconvert
  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
  // becomes
  //    (extract_subvector ((v8i16 ...), (i64 4)))

  // Only interested in 64-bit vectors as the ultimate result.
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();
  if (VT.getSimpleVT().getSizeInBits() != 64)
    return SDValue();
  // Is the operand an extract_subvector starting at the beginning or halfway
  // point of the vector? A low half may also come through as an
  // EXTRACT_SUBREG, so look for that, too.
  SDValue Op0 = N->getOperand(0);
  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
      !(Op0->isMachineOpcode() &&
        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
    return SDValue();
  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
      return SDValue();
  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
    if (idx != AArch64::dsub)
      return SDValue();
    // The dsub reference is equivalent to a lane zero subvector reference.
    idx = 0;
  }
  // Look through the bitcast of the input to the extract.
  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
    return SDValue();
  SDValue Source = Op0->getOperand(0)->getOperand(0);
  // If the source type has twice the number of elements as our destination
  // type, we know this is an extract of the high or low half of the vector.
  EVT SVT = Source->getValueType(0);
  if (!SVT.isVector() ||
      SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
    return SDValue();

  DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");

  // Create the simplified form to just extract the low or high half of the
  // vector directly rather than bothering with the bitcasts.
  SDLoc dl(N);
  unsigned NumElements = VT.getVectorNumElements();
  if (idx) {
    SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
  } else {
    SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
                                      Source, SubReg),
                   0);
  }
}
static SDValue performConcatVectorsCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);

  // Optimize concat_vectors of truncated vectors, where the intermediate
  // type is illegal, to avoid said illegality, e.g.,
  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
  //                          (v2i16 (truncate (v2i64)))))
  // ->
  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
  //                                    (v4i32 (bitcast (v2i64))),
  //                                    <0, 2, 4, 6>)))
  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
  // on both input and result type, so we might generate worse code.
  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
  if (N->getNumOperands() == 2 &&
      N0->getOpcode() == ISD::TRUNCATE &&
      N1->getOpcode() == ISD::TRUNCATE) {
    SDValue N00 = N0->getOperand(0);
    SDValue N10 = N1->getOperand(0);
    EVT N00VT = N00.getValueType();

    if (N00VT == N10.getValueType() &&
        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
      for (size_t i = 0; i < Mask.size(); ++i)
        Mask[i] = i * 2;
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getVectorShuffle(
                             MidVT, dl,
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
    }
  }

  // Wait 'til after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
  // canonicalise to that.
  if (N0 == N1 && VT.getVectorNumElements() == 2) {
    assert(VT.getScalarSizeInBits() == 64);
    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
                       DAG.getConstant(0, dl, MVT::i64));
  }

  // Canonicalise concat_vectors so that the right-hand vector has as few
  // bit-casts as possible before its real operation. The primary matching
  // destination for these operations will be the narrowing "2" instructions,
  // which depend on the operation being performed on this right-hand vector.
  // For example,
  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
  // becomes
  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

  if (N1->getOpcode() != ISD::BITCAST)
    return SDValue();
  SDValue RHS = N1->getOperand(0);
  MVT RHSTy = RHS.getValueType().getSimpleVT();
  // If the RHS is not a vector, this is not the pattern we're looking for.
  if (!RHSTy.isVector())
    return SDValue();

  DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
                                  RHSTy.getVectorNumElements() * 2);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
                                 RHS));
}
static SDValue tryCombineFixedPointConvert(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  // Wait until after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
  // Transform a scalar conversion of a value from a lane extract into a
  // lane extract of a vector conversion. E.g., from foo1 to foo2:
  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
  //
  // The second form interacts better with instruction selection and the
  // register allocator to avoid cross-class register copies that aren't
  // coalescable due to a lane reference.

  // Check the operand and see if it originates from a lane extract.
  SDValue Op1 = N->getOperand(1);
  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Yep, no additional predication needed. Perform the transform.
    SDValue IID = N->getOperand(0);
    SDValue Shift = N->getOperand(2);
    SDValue Vec = Op1.getOperand(0);
    SDValue Lane = Op1.getOperand(1);
    EVT ResTy = N->getValueType(0);
    EVT VecResTy;
    SDLoc DL(N);

    // The vector width should be 128 bits by the time we get here, even
    // if it started as 64 bits (the extract_vector handling will have
    // done so).
    assert(Vec.getValueSizeInBits() == 128 &&
           "unexpected vector size on extract_vector_elt!");
    if (Vec.getValueType() == MVT::v4i32)
      VecResTy = MVT::v4f32;
    else if (Vec.getValueType() == MVT::v2i64)
      VecResTy = MVT::v2f64;
    else
      llvm_unreachable("unexpected vector type!");

    SDValue Convert =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
  }
  return SDValue();
}
// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
//  (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
  switch (N.getOpcode()) {
  case AArch64ISD::DUP:
  case AArch64ISD::DUPLANE8:
  case AArch64ISD::DUPLANE16:
  case AArch64ISD::DUPLANE32:
  case AArch64ISD::DUPLANE64:
  case AArch64ISD::MOVI:
  case AArch64ISD::MOVIshift:
  case AArch64ISD::MOVIedit:
  case AArch64ISD::MOVImsl:
  case AArch64ISD::MVNIshift:
  case AArch64ISD::MVNImsl:
    break;
  default:
    // FMOV could be supported, but isn't very useful, as it would only occur
    // if you passed a bitcast'd floating point immediate to an eligible long
    // integer op (addl, smull, ...).
    return SDValue();
  }

  MVT NarrowTy = N.getSimpleValueType();
  if (!NarrowTy.is64BitVector())
    return SDValue();

  MVT ElementTy = NarrowTy.getVectorElementType();
  unsigned NumElems = NarrowTy.getVectorNumElements();
  MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);

  SDLoc dl(N);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
                     DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
                     DAG.getConstant(NumElems, dl, MVT::i64));
}
static bool isEssentiallyExtractSubvector(SDValue N) {
  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
    return true;

  return N.getOpcode() == ISD::BITCAST &&
         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
}
/// \brief Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
  const SDValue *Opnd0;
  const SDValue *Opnd1;
  ISD::CondCode CC;
};

/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
  const SDValue *Cmp;
  AArch64CC::CondCode CC;
};

/// \brief Helper structure to keep track of SetCC information.
union SetCCInfo {
  GenericSetCCInfo Generic;
  AArch64SetCCInfo AArch64;
};

/// \brief Helper structure to be able to read SetCC information. If the
/// IsAArch64 field is set to true, Info is an AArch64SetCCInfo; otherwise
/// Info is a GenericSetCCInfo.
struct SetCCInfoAndKind {
  SetCCInfo Info;
  bool IsAArch64;
};

/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
/// an AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  // If this is a setcc, this is straight forward.
  if (Op.getOpcode() == ISD::SETCC) {
    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SetCCInfo.IsAArch64 = false;
    return true;
  }
  // Otherwise, check if this is a matching csel instruction.
  // In other words:
  // (csel 1, 0, cc, cmp)
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return false;
  // Set the information about the operands.
  // TODO: we want the operands of the Cmp not the csel
  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  SetCCInfo.IsAArch64 = true;
  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // Check that the operands match the constraints:
  // (1) Both operands must be constants.
  // (2) One must be 1 and the other must be 0.
  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

  // Check (1).
  if (!TValue || !FValue)
    return false;

  // Check (2).
  if (!TValue->isOne()) {
    // Update the comparison when we are interested in !cc.
    std::swap(TValue, FValue);
    SetCCInfo.Info.AArch64.CC =
        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
  }
  return TValue->isOne() && FValue->isNullValue();
}
// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info) {
  if (isSetCC(Op, Info))
    return true;
  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
          isSetCC(Op->getOperand(0), Info));
}
// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
//   -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
  SDValue LHS = Op->getOperand(0);
  SDValue RHS = Op->getOperand(1);
  SetCCInfoAndKind InfoAndKind;

  // If neither operand is a SET_CC, give up.
  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
    std::swap(LHS, RHS);
    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
      return SDValue();
  }

  // FIXME: This could be generalized to work for FP comparisons.
  EVT CmpVT = InfoAndKind.IsAArch64
                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return SDValue();

  SDValue CCVal;
  SDValue Cmp;
  SDLoc dl(Op);
  if (InfoAndKind.IsAArch64) {
    CCVal = DAG.getConstant(
        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
        MVT::i32);
    Cmp = *InfoAndKind.Info.AArch64.Cmp;
  } else
    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
                        *InfoAndKind.Info.Generic.Opnd1,
                        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
                        CCVal, DAG, dl);

  EVT VT = Op->getValueType(0);
  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
//      (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  if (!VT.is128BitVector()) {
    if (N->getOpcode() == ISD::ADD)
      return performSetccAddFolding(N, DAG);
    return SDValue();
  }

  // Make sure both branches are extended in the same way.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
      LHS.getOpcode() != RHS.getOpcode())
    return SDValue();

  unsigned ExtType = LHS.getOpcode();

  // It's not worth doing if at least one of the inputs isn't already an
  // extract, but we don't know which it'll be so we have to try both.
  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
    if (!RHS.getNode())
      return SDValue();

    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
    if (!LHS.getNode())
      return SDValue();

    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
  }

  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
//   (aarch64_neon_umull (extract_high (v2i64 vec)))
//                       (extract_high (v2i64 (dup128 scalar)))))
//
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  assert(LHS.getValueType().is64BitVector() &&
         RHS.getValueType().is64BitVector() &&
         "unexpected shape for long operation");

  // Either node could be a DUP, but it's not worth doing both of them (you'd
  // just as well use the non-high version) so look for a corresponding extract
  // operation on the other "wing".
  if (isEssentiallyExtractSubvector(LHS)) {
    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
    if (!RHS.getNode())
      return SDValue();
  } else if (isEssentiallyExtractSubvector(RHS)) {
    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
    if (!LHS.getNode())
      return SDValue();
  }

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), LHS, RHS);
}
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
  unsigned ElemBits = ElemTy.getSizeInBits();

  int64_t ShiftAmount;
  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
    APInt SplatValue, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, ElemBits) ||
        SplatBitSize != ElemBits)
      return SDValue();

    ShiftAmount = SplatValue.getSExtValue();
  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    ShiftAmount = CVN->getSExtValue();
  } else
    return SDValue();

  unsigned Opcode;
  bool IsRightShift;
  switch (IID) {
  default:
    llvm_unreachable("Unknown shift intrinsic");
  case Intrinsic::aarch64_neon_sqshl:
    Opcode = AArch64ISD::SQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_uqshl:
    Opcode = AArch64ISD::UQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_srshl:
    Opcode = AArch64ISD::SRSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_urshl:
    Opcode = AArch64ISD::URSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_sqshlu:
    Opcode = AArch64ISD::SQSHLU_I;
    IsRightShift = false;
    break;
  }

  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
    SDLoc dl(N);
    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(-ShiftAmount, dl, MVT::i32));
  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
    SDLoc dl(N);
    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  return SDValue();
}
// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, this means there's almost
// certainly going to be a zext in the DAG which we can eliminate.
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
  SDValue AndN = N->getOperand(2);
  if (AndN.getOpcode() != ISD::AND)
    return SDValue();

  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
  if (!CMask || CMask->getZExtValue() != Mask)
    return SDValue();

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}
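// Illustrative example: IR such as
//   %b = and i32 %data, 255
//   %r = call i32 @llvm.aarch64.crc32b(i32 %acc, i32 %b)
// is rewritten to operate on %data directly, since crc32b only reads the low
// 8 bits of its data operand anyway.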
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
                     DAG.getNode(Opc, dl,
                                 N->getOperand(1).getSimpleValueType(),
                                 N->getOperand(1)),
                     DAG.getConstant(0, dl, MVT::i64));
}
static SDValue performIntrinsicCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const AArch64Subtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  unsigned IID = getIntrinsicID(N);
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_neon_vcvtfxs2fp:
  case Intrinsic::aarch64_neon_vcvtfxu2fp:
    return tryCombineFixedPointConvert(N, DCI, DAG);
  case Intrinsic::aarch64_neon_saddv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
  case Intrinsic::aarch64_neon_uaddv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
  case Intrinsic::aarch64_neon_sminv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
  case Intrinsic::aarch64_neon_uminv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
  case Intrinsic::aarch64_neon_smaxv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
  case Intrinsic::aarch64_neon_umaxv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
  case Intrinsic::aarch64_neon_fmax:
    return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fmin:
    return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fmaxnm:
    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fminnm:
    return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_smull:
  case Intrinsic::aarch64_neon_umull:
  case Intrinsic::aarch64_neon_pmull:
  case Intrinsic::aarch64_neon_sqdmull:
    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
  case Intrinsic::aarch64_neon_sqshl:
  case Intrinsic::aarch64_neon_uqshl:
  case Intrinsic::aarch64_neon_sqshlu:
  case Intrinsic::aarch64_neon_srshl:
  case Intrinsic::aarch64_neon_urshl:
    return tryCombineShiftImm(IID, N, DAG);
  case Intrinsic::aarch64_crc32b:
  case Intrinsic::aarch64_crc32cb:
    return tryCombineCRC32(0xff, N, DAG);
  case Intrinsic::aarch64_crc32h:
  case Intrinsic::aarch64_crc32ch:
    return tryCombineCRC32(0xffff, N, DAG);
  }
  return SDValue();
}
static SDValue performExtendCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
  // we can convert that DUP into another extract_high (of a bigger DUP), which
  // helps the backend to decide that an sabdl2 would be useful, saving a real
  // extract_high operation.
  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
    SDNode *ABDNode = N->getOperand(0).getNode();
    unsigned IID = getIntrinsicID(ABDNode);
    if (IID == Intrinsic::aarch64_neon_sabd ||
        IID == Intrinsic::aarch64_neon_uabd) {
      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
      if (!NewABD.getNode())
        return SDValue();

      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
                         NewABD);
    }
  }

  // This is effectively a custom type legalization for AArch64.
  //
  // Type legalization will split an extend of a small, legal, type to a larger
  // illegal type by first splitting the destination type, often creating
  // illegal source types, which then get legalized in isel-confusing ways,
  // leading to really terrible codegen. E.g.,
  //   %result = v8i32 sext v8i8 %value
  // becomes
  //   %losrc = extract_subreg %value, ...
  //   %hisrc = extract_subreg %value, ...
  //   %lo = v4i32 sext v4i8 %losrc
  //   %hi = v4i32 sext v4i8 %hisrc
  // Things go rapidly downhill from there.
  //
  // For AArch64, the [sz]ext vector instructions can only go up one element
  // size, so we can, e.g., extend from i8 to i16, but going from i8 to i32
  // takes two instructions.
  //
  // This implies that the most efficient way to do the extend from v8i8
  // to two v4i32 values is to first extend the v8i8 to v8i16, then allow
  // the normal splitting to happen for the v8i16->v8i32.

  // This is pre-legalization to catch some cases where the default
  // type legalization will create ill-tempered code.
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  // We're only interested in cleaning things up for non-legal vector types
  // here. If both the source and destination are legal, things will just
  // work naturally without any fiddling.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT ResVT = N->getValueType(0);
  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
    return SDValue();

  // If the vector type isn't a simple VT, it's beyond the scope of what
  // we're worried about here. Let legalization do its thing and hope for
  // the best.
  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src->getValueType(0);
  if (!ResVT.isSimple() || !SrcVT.isSimple())
    return SDValue();

  // If the source VT is a 64-bit vector, we can play games and get the
  // better results we want.
  if (SrcVT.getSizeInBits() != 64)
    return SDValue();

  unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
  unsigned ElementCount = SrcVT.getVectorNumElements();
  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
  SDLoc DL(N);
  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);

  // Now split the rest of the operation into two halves, each with a 64
  // bit source.
  EVT LoVT, HiVT;
  SDValue Lo, Hi;
  unsigned NumElements = ResVT.getVectorNumElements();
  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
  LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
                                 ResVT.getVectorElementType(), NumElements / 2);

  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
                               LoVT.getVectorNumElements());
  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
                   DAG.getConstant(0, DL, MVT::i64));
  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
                   DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);

  // Now combine the parts back together so we still have a single result
  // like the combiner expects.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
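// Illustrative only: for the v8i8 -> v8i32 sext case described above, the
// combine produces
//   t0 = v8i16 sext (v8i8 src)
//   lo = v4i32 sext (extract_subvector t0, 0)
//   hi = v4i32 sext (extract_subvector t0, 4)
//   result = concat_vectors lo, hi
// so each individual extend only widens by one element size.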
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
                               SDValue SplatVal, unsigned NumVecElts) {
  unsigned OrigAlignment = St.getAlignment();
  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;

  // Create scalar stores. This is at least as good as the code sequence for a
  // split unaligned store which is a dup.s, ext.b, and two stores.
  // Most of the time the three stores should be replaced by store pair
  // instructions (stp).
  SDLoc DL(&St);
  SDValue BasePtr = St.getBasePtr();
  uint64_t BaseOffset = 0;

  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
  SDValue NewST1 =
      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
                   OrigAlignment, St.getMemOperand()->getFlags());

  // As this is in ISel, we will not merge this add which may degrade results.
  if (BasePtr->getOpcode() == ISD::ADD &&
      isa<ConstantSDNode>(BasePtr->getOperand(1))) {
    BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
    BasePtr = BasePtr->getOperand(0);
  }

  unsigned Offset = EltOffset;
  while (--NumVecElts) {
    unsigned Alignment = MinAlign(OrigAlignment, Offset);
    SDValue OffsetPtr =
        DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                    DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
                          PtrInfo.getWithOffset(Offset), Alignment,
                          St.getMemOperand()->getFlags());
    Offset += EltOffset;
  }
  return NewST1;
}
/// Replace a splat of zeros in a vector store with scalar stores of WZR/XZR.
/// The load store optimizer pass will merge them into store pair stores. This
/// should be better than a movi to create the vector zero followed by a vector
/// store if the zero constant is not re-used, since one instruction and one
/// register live range will be removed.
///
/// For example, the final generated code should be:
///
///   stp xzr, xzr, [x0]
///
/// instead of:
///
///   movi v0.2d, #0
///   str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
  // 2, 3 or 4 i32 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
         VT.getVectorElementType().getSizeInBits() == 64) ||
        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
         VT.getVectorElementType().getSizeInBits() == 32)))
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
      return SDValue();
  }

  // Use a CopyFromReg WZR/XZR here to prevent
  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
  SDLoc DL(&St);
  unsigned ZeroReg;
  EVT ZeroVT;
  if (VT.getVectorElementType().getSizeInBits() == 32) {
    ZeroReg = AArch64::WZR;
    ZeroVT = MVT::i32;
  } else {
    ZeroReg = AArch64::XZR;
    ZeroVT = MVT::i64;
  }
  SDValue SplatVal =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Replace a splat of a scalar in a vector store with scalar stores of that
/// scalar value. The load store optimizer pass will merge them into store pair
/// stores. This has better performance than a splat of the scalar followed by
/// a split vector store. Even if the stores are not merged, it is four stores
/// vs a dup followed by an ext.b and two stores.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Don't replace floating point stores, they possibly won't be transformed to
  // stp because of the store pair suppress pass.
  if (VT.isFloatingPoint())
    return SDValue();

  // We can express a splat as store pair(s) for 2 or 4 elements.
  unsigned NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();

  // Check that this is a splat.
  // Make sure that each of the relevant vector element locations are inserted
  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
  SDValue SplatVal;
  for (unsigned I = 0; I < NumVecElts; ++I) {
    // Check for insert vector elements.
    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return SDValue();

    // Check that the same value is inserted at each vector element.
    if (I == 0)
      SplatVal = StVal.getOperand(1);
    else if (StVal.getOperand(1) != SplatVal)
      return SDValue();

    // Check insert element index.
    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
    if (!CIndex)
      return SDValue();
    uint64_t IndexVal = CIndex->getZExtValue();
    if (IndexVal >= NumVecElts)
      return SDValue();
    IndexNotInserted.reset(IndexVal);

    StVal = StVal.getOperand(0);
  }
  // Check that all vector element locations were inserted to.
  if (IndexNotInserted.any())
    return SDValue();

  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
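// Illustrative only: a v4i32 splat store of w1 becomes four scalar "str w1"
// stores at offsets 0, 4, 8 and 12, which the load/store optimizer is then
// expected to pair into two "stp w1, w1" instructions.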
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG,
                           const AArch64Subtarget *Subtarget) {

  StoreSDNode *S = cast<StoreSDNode>(N);
  if (S->isVolatile() || S->isIndexed())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();

  if (!VT.isVector())
    return SDValue();

  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
    return ReplacedZeroSplat;

  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.
  if (!Subtarget->isMisaligned128StoreSlow())
    return SDValue();

  // Don't split at -Oz.
  if (DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();

  // Split unaligned 16B stores. They are terrible for performance.
  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
  // extensions can use this to mark that it does not want splitting to happen
  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
      S->getAlignment() <= 2)
    return SDValue();

  // If we get a splat of a scalar, convert this vector store to a store of
  // scalars. They will be merged into store pairs thereby removing two
  // instructions.
  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
    return ReplacedSplat;

  SDLoc DL(S);
  unsigned NumElts = VT.getVectorNumElements() / 2;
  // Split VT into two.
  EVT HalfVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(0, DL, MVT::i64));
  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(NumElts, DL, MVT::i64));
  SDValue BasePtr = S->getBasePtr();
  SDValue NewST1 =
      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
                   S->getAlignment(), S->getMemOperand()->getFlags());
  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                  DAG.getConstant(8, DL, MVT::i64));
  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
                      S->getPointerInfo(), S->getAlignment(),
                      S->getMemOperand()->getFlags());
}
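// Illustrative only: an unaligned "str q0, [x0]" that trips the checks above
// is rewritten as two 64-bit stores of the low and high halves at [x0] and
// [x0, #8], which are cheaper than a slow misaligned 128-bit store on the
// affected subtargets.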
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     bool IsLaneOp) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  unsigned LoadIdx = IsLaneOp ? 1 : 0;
  SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not a LOAD, we cannot do this combine.
  if (LD->getOpcode() != ISD::LOAD)
    return SDValue();

  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
  EVT MemVT = LoadSDN->getMemoryVT();
  // Check if the memory operand is the same type as the vector element.
  if (MemVT != VT.getVectorElementType())
    return SDValue();

  // Check if there are other uses. If so, do not combine as it will introduce
  // an extra load.
  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
       ++UI) {
    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
      continue;
    if (*UI != N)
      return SDValue();
  }

  SDValue Addr = LD->getOperand(1);
  SDValue Vector = N->getOperand(0);
  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
       Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD
        || UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load. Otherwise, folding it
    // would create a cycle.
    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
      continue;
    // Also check that the add is not used in the vector operand. This would
    // also create a cycle.
    if (User->isPredecessorOf(Vector.getNode()))
      continue;

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint32_t IncVal = CInc->getZExtValue();
      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
      if (IncVal != NumBytes)
        continue;
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }

    // Finally, check that the vector doesn't depend on the load.
    // Again, this would create a cycle.
    // The load depending on the vector is fine, as that's the case for the
    // LD1*post we'll eventually generate anyway.
    if (LoadSDN->isPredecessorOf(Vector.getNode()))
      continue;

    SmallVector<SDValue, 8> Ops;
    Ops.push_back(LD->getOperand(0));  // Chain
    if (IsLaneOp) {
      Ops.push_back(Vector);           // The vector to be inserted
      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
    }
    Ops.push_back(Addr);
    Ops.push_back(Inc);

    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
    SDVTList SDTys = DAG.getVTList(Tys);
    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                           MemVT,
                                           LoadSDN->getMemOperand());

    // Update the uses.
    SDValue NewResults[] = {
      SDValue(LD, 0),            // The result of load
      SDValue(UpdN.getNode(), 2) // Chain
    };
    DCI.CombineTo(LD, NewResults);
    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register

    break;
  }
  return SDValue();
}
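// Illustrative only: a scalar load feeding an INSERT_VECTOR_ELT (or a DUP)
// whose address is also incremented by the element size, e.g. an i32 load
// followed by "add x0, x0, #4", becomes a single post-incremented
// LD1LANEpost/LD1DUPpost node, selected as "ld1 { v0.s }[1], [x0], #4".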
/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
static bool performTBISimplification(SDValue Addr,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
    return true;
  }
  return false;
}

static SDValue performSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG,
                                   const AArch64Subtarget *Subtarget) {
  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
    return Split;

  if (Subtarget->supportsAddressTopByteIgnored() &&
      performTBISimplification(N->getOperand(2), DCI, DAG))
    return SDValue(N, 0);

  return SDValue();
}
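// Illustrative only: with top-byte-ignore enabled, an address computed as
// "and x1, x1, #0x00ffffffffffffff" that is only used by a load or store can
// have the mask stripped by performTBISimplification above, since only the
// low 56 address bits are demanded and the hardware ignores bits 63:56.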
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  unsigned AddrOpIdx = N->getNumOperands() - 1;
  SDValue Addr = N->getOperand(AddrOpIdx);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle.
    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
      continue;

    // Find the new opcode for the updating load/store.
    bool IsStore = false;
    bool IsLaneOp = false;
    bool IsDupOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default: llvm_unreachable("unexpected intrinsic for Neon base update");
    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
      NumVecs = 2; break;
    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
      NumVecs = 3; break;
    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
      NumVecs = 4; break;
    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
      NumVecs = 2; IsStore = true; break;
    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
      NumVecs = 3; IsStore = true; break;
    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
      NumVecs = 4; IsStore = true; break;
    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
      NumVecs = 2; break;
    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
      NumVecs = 3; break;
    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
      NumVecs = 4; break;
    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
      NumVecs = 2; IsStore = true; break;
    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
      NumVecs = 3; IsStore = true; break;
    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
      NumVecs = 4; IsStore = true; break;
    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
      NumVecs = 2; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
      NumVecs = 3; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
      NumVecs = 4; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
      NumVecs = 2; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
      NumVecs = 3; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
      NumVecs = 4; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
    }

    EVT VecTy;
    if (IsStore)
      VecTy = N->getOperand(2).getValueType();
    else
      VecTy = N->getValueType(0);

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint32_t IncVal = CInc->getZExtValue();
      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
      if (IsLaneOp || IsDupOp)
        NumBytes /= VecTy.getVectorNumElements();
      if (IncVal != NumBytes)
        continue;
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // Incoming chain
    // Load lane and store have vector list as input.
    if (IsLaneOp || IsStore)
      for (unsigned i = 2; i < AddrOpIdx; ++i)
        Ops.push_back(N->getOperand(i));
    Ops.push_back(Addr); // Base register
    Ops.push_back(Inc);

    // Return Types.
    EVT Tys[6];
    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i64;  // Type of write back register
    Tys[n] = MVT::Other;  // Type of the chain
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
                                           MemInt->getMemoryVT(),
                                           MemInt->getMemOperand());

    // Update the uses.
    std::vector<SDValue> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i) {
      NewResults.push_back(SDValue(UpdN.getNode(), i));
    }
    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}
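// Illustrative only: "ld2 { v0.4s, v1.4s }, [x0]" followed by
// "add x0, x0, #32" matches the constant-increment path above (NumVecs = 2,
// 16 bytes per vector) and is replaced by the write-back form
// "ld2 { v0.4s, v1.4s }, [x0], #32".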
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
static
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
  ExtType = ISD::NON_EXTLOAD;
  switch(V.getNode()->getOpcode()) {
  default:
    return false;
  case ISD::LOAD: {
    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
      ExtType = LoadNode->getExtensionType();
      return true;
    }
    return false;
  }
  case ISD::AssertSext: {
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if ((TypeNode->getVT() == MVT::i8 && width == 8)
        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
      ExtType = ISD::SEXTLOAD;
      return true;
    }
    return false;
  }
  case ISD::AssertZext: {
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if ((TypeNode->getVT() == MVT::i8 && width == 8)
        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
      ExtType = ISD::ZEXTLOAD;
      return true;
    }
    return false;
  }
  case ISD::Constant:
  case ISD::TargetConstant: {
    return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
           1LL << (width - 1);
  }
  }

  return true;
}
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
//  +-------------+ +-------------+ +-------------+ +-------------+
//  |    Input    | | AddConstant | | CompConstant| |     CC      |
//  +-------------+ +-------------+ +-------------+ +-------------+
//         |              |                |              |
//         V              V                |              |
//        +-------------+ +----+           |              |
//        |     ADD     | |0xff|           |              |
//        +-------------+ +----+           |              |
//               |           |             |              |
//               V           V             |              |
//              +-------------+            |              |
//              |     AND     |            |              |
//              +-------------+            |              |
//                     |                   |              |
//                     +--------+   +------+              |
//                              |   |                     |
//                              V   V                     V
//                          +-------------+      +----------------+
//                          |    SUBS     |----->| CSEL / B.cond  |
//                          +-------------+      +----------------+
//
// the AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the
// nominal width of the input (this can work for any width of input; the above
// graph is specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4-bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8, 15), 8 patterns unique to sign extension (-8, -1), and 8
// patterns present in both extensions (0, 7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result is the same for all 16
// distinct bit patterns of the current extension type of Input (w0). This was
// verified with a test harness that compared the condition result of the
// masked sequence (e.g. "and w10, w8, #0x0f" feeding the compare, then
// "cset w9, AArch64CC") with the result of the unmasked sequence
// ("cset w11, AArch64CC").
//
// Since that harness shows when the outputs are equivalent, it defines when it
// is safe to remove the AND. Unfortunately it only runs on AArch64 and would
// be expensive to run during compiles. The equations below were checked
// against the harness and confirmed to give equivalent outputs for all inputs,
// so they can be used to determine if the removal is legal.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition since the DAG can take several forms.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons, sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range, we can just adjust the input and avoid writing equations
  // for sign extended inputs.
  if (ExtType == ISD::SEXTLOAD)
    AddConstant -= (1 << (width - 1));

  switch(CC) {
  case AArch64CC::LE:
  case AArch64CC::GT:
    if ((AddConstant == 0) ||
        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
        (AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::LT:
  case AArch64CC::GE:
    if ((AddConstant == 0) ||
        (AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::HI:
  case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant >= -1 &&
         CompConstant < AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::PL:
  case AArch64CC::MI:
    if ((AddConstant == 0) ||
        (AddConstant > 0 && CompConstant <= 0) ||
        (AddConstant < 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::LO:
  case AArch64CC::HS:
    if ((AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant >= 0 &&
         CompConstant <= AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::EQ:
  case AArch64CC::NE:
    if ((AddConstant > 0 && CompConstant < 0) ||
        (AddConstant < 0 && CompConstant >= 0 &&
         CompConstant < AddConstant + MaxUInt) ||
        (AddConstant >= 0 && CompConstant >= 0 &&
         CompConstant >= AddConstant) ||
        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::VS:
  case AArch64CC::VC:
  case AArch64CC::AL:
  case AArch64CC::NV:
    return true;
  case AArch64CC::Invalid:
    break;
  }

  return false;
}
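// Illustrative only: with a zero-extended i8 input, AddConstant == 0 and
// CompConstant == 0, the LO/HS equations above report the mask as removable:
// "(and (add x, 0), 0xff)" and "(add x, 0)" produce the same 8-bit value, so
// the unsigned comparison against 0 sets the same flags either way.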
static SDValue performCONDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  SelectionDAG &DAG, unsigned CCIndex,
                                  unsigned CmpIndex) {
  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
  unsigned CondOpcode = SubsNode->getOpcode();

  if (CondOpcode != AArch64ISD::SUBS)
    return SDValue();

  // There is a SUBS feeding this condition. Is it fed by a mask we can
  // use?

  SDNode *AndNode = SubsNode->getOperand(0).getNode();
  unsigned MaskBits = 0;

  if (AndNode->getOpcode() != ISD::AND)
    return SDValue();

  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
    uint32_t CNV = CN->getZExtValue();
    if (CNV == 255)
      MaskBits = 8;
    else if (CNV == 65535)
      MaskBits = 16;
  }

  if (!MaskBits)
    return SDValue();

  SDValue AddValue = AndNode->getOperand(0);

  if (AddValue.getOpcode() != ISD::ADD)
    return SDValue();

  // The basic dag structure is correct, grab the inputs and validate them.

  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
  SDValue SubsInputValue = SubsNode->getOperand(1);

  // The mask is present and the provenance of all the values is a smaller
  // type; let's see if the mask is superfluous.

  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
      !isa<ConstantSDNode>(SubsInputValue.getNode()))
    return SDValue();

  ISD::LoadExtType ExtType;

  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue1, MaskBits, ExtType))
    return SDValue();

  if (!isEquivalentMaskless(CC, MaskBits, ExtType,
                            cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
                            cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
    return SDValue();

  // The AND is not necessary, remove it.

  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
                               SubsNode->getValueType(1));
  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

  return SDValue(N, 0);
}
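// Illustrative only: for a CSEL/BRCOND conditioned on
// "SUBS (AND (ADD x, c1), 0xff), c2" where x, c1 and c2 are all provably
// 8 bits wide, the combine above rebuilds the SUBS directly on the ADD and
// drops the now-redundant AND.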
// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
    N = NV.getNode();
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue CCVal = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
    return SDValue();

  unsigned CmpOpc = Cmp.getOpcode();
  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
    return SDValue();

  // Only attempt folding if there is only one use of the flag and no use of
  // the value.
  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected the value type to be the same for both operands!");
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return SDValue();

  if (isNullConstant(LHS))
    std::swap(LHS, RHS);

  if (!isNullConstant(RHS))
    return SDValue();

  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
      LHS.getOpcode() == ISD::SRL)
    return SDValue();

  // Fold the compare into the branch instruction.
  SDValue BR;
  if (CC == AArch64CC::EQ)
    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
  else
    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, BR, false);

  return SDValue();
}
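// Illustrative only: a conditional branch on the EQ flags of
// "subs wzr, w0, #0" (with no other users of the flags) is folded by the
// combine above into "cbz w0, dest"; the NE case becomes "cbnz w0, dest".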
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
// AArch64ISD::TBZ is matched during legalization.
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
                                 SelectionDAG &DAG) {

  if (!Op->hasOneUse())
    return Op;

  // We don't handle undef/constant-fold cases below, as they should have
  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
  // etc.)

  // (tbz (trunc x), b) -> (tbz x, b)
  // This case is just here to enable more of the below cases to be caught.
  if (Op->getOpcode() == ISD::TRUNCATE &&
      Bit < Op->getValueType(0).getSizeInBits()) {
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }

  if (Op->getNumOperands() != 2)
    return Op;

  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  if (!C)
    return Op;

  switch (Op->getOpcode()) {
  default:
    return Op;

  // (tbz (and x, m), b) -> (tbz x, b)
  case ISD::AND:
    if ((C->getZExtValue() >> Bit) & 1)
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    return Op;

  // (tbz (shl x, c), b) -> (tbz x, b-c)
  case ISD::SHL:
    if (C->getZExtValue() <= Bit &&
        (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
      Bit = Bit - C->getZExtValue();
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    }
    return Op;

  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
  case ISD::SRA:
    Bit = Bit + C->getZExtValue();
    if (Bit >= Op->getValueType(0).getSizeInBits())
      Bit = Op->getValueType(0).getSizeInBits() - 1;
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);

  // (tbz (srl x, c), b) -> (tbz x, b+c)
  case ISD::SRL:
    if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
      Bit = Bit + C->getZExtValue();
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    }
    return Op;

  // (tbz (xor x, -1), b) -> (tbnz x, b)
  case ISD::XOR:
    if ((C->getZExtValue() >> Bit) & 1)
      Invert = !Invert;
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }
}

// Optimize test single bit zero/non-zero and branch.
static SDValue performTBZCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 SelectionDAG &DAG) {
  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  bool Invert = false;
  SDValue TestSrc = N->getOperand(1);
  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);

  if (TestSrc == NewTestSrc)
    return SDValue();

  unsigned NewOpc = N->getOpcode();
  if (Invert) {
    if (NewOpc == AArch64ISD::TBZ)
      NewOpc = AArch64ISD::TBNZ;
    else {
      assert(NewOpc == AArch64ISD::TBNZ);
      NewOpc = AArch64ISD::TBZ;
    }
  }

  SDLoc DL(N);
  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
                     DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
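// Illustrative only: "tbz (lsr w8, #3), #0, bb" becomes "tbz w8, #3, bb" via
// the SRL rule in getTestBitOperand, and a test of an xor-with-allones
// operand additionally flips TBZ to TBNZ.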
// vselect (v1i1 setcc) ->
//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  EVT CCVT = N0.getValueType();

  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
      CCVT.getVectorElementType() != MVT::i1)
    return SDValue();

  EVT ResVT = N->getValueType(0);
  EVT CmpVT = N0.getOperand(0).getValueType();
  // Only combine when the result type is of the same size as the compared
  // operands.
  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
    return SDValue();

  SDValue IfTrue = N->getOperand(1);
  SDValue IfFalse = N->getOperand(2);
  SDValue SetCC =
      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
                   N0.getOperand(0), N0.getOperand(1),
                   cast<CondCodeSDNode>(N0.getOperand(2))->get());
  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
                     IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
static SDValue performSelectCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT ResVT = N->getValueType(0);

  if (N0.getOpcode() != ISD::SETCC)
    return SDValue();

  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
  // scalar SetCCResultType. We also don't expect vectors, because we assume
  // that selects fed by vector SETCCs are canonicalized to VSELECT.
  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
         "Scalar-SETCC feeding SELECT has unexpected result type!");

  // If NumMaskElts == 0, the comparison is larger than the select result. The
  // largest real NEON comparison is 64-bits per lane, which means the result
  // is at most 32-bits and an illegal vector. Just bail out for now.
  EVT SrcVT = N0.getOperand(0).getValueType();

  // Don't try to do this optimization when the setcc itself has i1 operands.
  // There are no legal vectors of i1, so this would be pointless.
  if (SrcVT == MVT::i1)
    return SDValue();

  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
  if (!ResVT.isVector() || NumMaskElts == 0)
    return SDValue();

  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

  // Also bail out if the vector CCVT isn't the same size as ResVT.
  // This can happen if the SETCC operand size doesn't divide the ResVT size
  // (e.g., f64 vs v3f32).
  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
    return SDValue();

  // Make sure we didn't create illegal types, if we're not supposed to.
  assert(DCI.isBeforeLegalize() ||
         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));

  // First perform a vector comparison, where lane 0 is the one we're
  // interested in.
  SDLoc DL(N0);
  SDValue LHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
  SDValue RHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));

  // Now duplicate the comparison mask we want across all other lanes.
  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
  Mask = DAG.getNode(ISD::BITCAST, DL,
                     ResVT.changeVectorElementTypeToInteger(), Mask);

  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
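// Illustrative only: "select (setcc f64 a, b), v2f64 x, v2f64 y" is rewritten
// so the comparison is done as a v2f64 vector setcc on lane 0, the resulting
// mask is DUP'd across both lanes, and the select becomes a vector select on
// that mask instead of going through NZCV.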
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
  if (N->getValueType(0) == N->getOperand(0).getValueType())
    return N->getOperand(0);

  return SDValue();
}

SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default:
    DEBUG(dbgs() << "Custom combining: skipping\n");
    break;
  case ISD::ADD:
  case ISD::SUB:
    return performAddSubLongCombine(N, DCI, DAG);
  case ISD::XOR:
    return performXorCombine(N, DAG, DCI, Subtarget);
  case ISD::MUL:
    return performMulCombine(N, DAG, DCI, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performIntToFpCombine(N, DAG, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return performFpToIntCombine(N, DAG, DCI, Subtarget);
  case ISD::FDIV:
    return performFDivCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:
    return performORCombine(N, DCI, Subtarget);
  case ISD::SRL:
    return performSRLCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicCombine(N, DCI, Subtarget);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
    return performExtendCombine(N, DCI, DAG);
  case ISD::BITCAST:
    return performBitcastCombine(N, DCI, DAG);
  case ISD::CONCAT_VECTORS:
    return performConcatVectorsCombine(N, DCI, DAG);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::VSELECT:
    return performVSelectCombine(N, DCI.DAG);
  case ISD::LOAD:
    if (performTBISimplification(N->getOperand(1), DCI, DAG))
      return SDValue(N, 0);
    break;
  case ISD::STORE:
    return performSTORECombine(N, DCI, DAG, Subtarget);
  case AArch64ISD::BRCOND:
    return performBRCONDCombine(N, DCI, DAG);
  case AArch64ISD::TBNZ:
  case AArch64ISD::TBZ:
    return performTBZCombine(N, DCI, DAG);
  case AArch64ISD::CSEL:
    return performCONDCombine(N, DCI, DAG, 2, 3);
  case AArch64ISD::DUP:
    return performPostLD1Combine(N, DCI, false);
  case AArch64ISD::NVCAST:
    return performNVCASTCombine(N);
  case ISD::INSERT_VECTOR_ELT:
    return performPostLD1Combine(N, DCI, true);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::aarch64_neon_ld2:
    case Intrinsic::aarch64_neon_ld3:
    case Intrinsic::aarch64_neon_ld4:
    case Intrinsic::aarch64_neon_ld1x2:
    case Intrinsic::aarch64_neon_ld1x3:
    case Intrinsic::aarch64_neon_ld1x4:
    case Intrinsic::aarch64_neon_ld2lane:
    case Intrinsic::aarch64_neon_ld3lane:
    case Intrinsic::aarch64_neon_ld4lane:
    case Intrinsic::aarch64_neon_ld2r:
    case Intrinsic::aarch64_neon_ld3r:
    case Intrinsic::aarch64_neon_ld4r:
    case Intrinsic::aarch64_neon_st2:
    case Intrinsic::aarch64_neon_st3:
    case Intrinsic::aarch64_neon_st4:
    case Intrinsic::aarch64_neon_st1x2:
    case Intrinsic::aarch64_neon_st1x3:
    case Intrinsic::aarch64_neon_st1x4:
    case Intrinsic::aarch64_neon_st2lane:
    case Intrinsic::aarch64_neon_st3lane:
    case Intrinsic::aarch64_neon_st4lane:
      return performNEONPostLDSTCombine(N, DCI, DAG);
    default:
      break;
    }
    break;
  }
  return SDValue();
}
// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs
// that the generic analysis code won't necessarily catch.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
                                               SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
        MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode *Node : Copy->uses()) {
    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   bool &IsInc,
                                                   SelectionDAG &DAG) const {
  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
    return false;

  Base = Op->getOperand(0);
  // All of the indexed addressing mode instructions take a signed
  // 9 bit immediate offset.
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
    int64_t RHSC = RHS->getSExtValue();
    if (Op->getOpcode() == ISD::SUB)
      RHSC = -(uint64_t)RHSC;
    if (!isInt<9>(RHSC))
      return false;
    IsInc = (Op->getOpcode() == ISD::ADD);
    Offset = Op->getOperand(1);
    return true;
  }
  return false;
}

bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                      SDValue &Offset,
                                                      ISD::MemIndexedMode &AM,
                                                      SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
    return false;
  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}

bool AArch64TargetLowering::getPostIndexedAddressParts(
    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
    return false;
  // Post-indexing updates the base, so it's not a valid transform
  // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;
  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);

  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
    return;

  Op = SDValue(
      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                         DAG.getUNDEF(MVT::i32), Op,
                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
      0);
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}

static void ReplaceReductionResults(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG, unsigned InterOp,
                                    unsigned AcrossOp) {
  EVT LoVT, HiVT;
  SDValue Lo, Hi;
  SDLoc dl(N);
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
  Results.push_back(SplitVal);
}

static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
                           DAG.getNode(ISD::SRL, DL, MVT::i128, N,
                                       DAG.getConstant(64, DL, MVT::i64)));
  return std::make_pair(Lo, Hi);
}

// Create an even/odd pair of X registers holding integer value V.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());
  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
  SDValue VHi = DAG.getAnyExtOrTrunc(
      DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
      dl, MVT::i64);
  if (DAG.getDataLayout().isBigEndian())
    std::swap(VLo, VHi);
  SDValue RegClass =
      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}
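// Note: CASP-family instructions require their 128-bit operands in an
// even/odd X register pair; the REG_SEQUENCE above models that constraint with
// the XSeqPairs register class so the register allocator picks a valid pair.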
static void ReplaceCMP_SWAP_128Results(SDNode *N,
                                       SmallVectorImpl<SDValue> &Results,
                                       SelectionDAG &DAG,
                                       const AArch64Subtarget *Subtarget) {
  assert(N->getValueType(0) == MVT::i128 &&
         "AtomicCmpSwap on types less than 128 should be legal");

  if (Subtarget->hasLSE()) {
    // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
    // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
    SDValue Ops[] = {
        createGPRPairNode(DAG, N->getOperand(2)), // Compare value
        createGPRPairNode(DAG, N->getOperand(3)), // Store value
        N->getOperand(1), // Ptr
        N->getOperand(0), // Chain in
    };

    MachineFunction &MF = DAG.getMachineFunction();
    MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
    MemOp[0] = cast<MemSDNode>(N)->getMemOperand();

    unsigned Opcode;
    switch (MemOp[0]->getOrdering()) {
    case AtomicOrdering::Monotonic:
      Opcode = AArch64::CASPX;
      break;
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }

    MachineSDNode *CmpSwap = DAG.getMachineNode(
        Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
    CmpSwap->setMemRefs(MemOp, MemOp + 1);

    unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
    if (DAG.getDataLayout().isBigEndian())
      std::swap(SubReg1, SubReg2);
    Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
                                                 SDValue(CmpSwap, 0)));
    Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
                                                 SDValue(CmpSwap, 0)));
    Results.push_back(SDValue(CmpSwap, 1)); // Chain out
    return;
  }

  auto Desired = splitInt128(N->getOperand(2), DAG);
  auto New = splitInt128(N->getOperand(3), DAG);
  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
                   New.first, New.second, N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      AArch64::CMP_SWAP_128, SDLoc(N),
      DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);

  MachineFunction &MF = DAG.getMachineFunction();
  MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);

  Results.push_back(SDValue(CmpSwap, 0));
  Results.push_back(SDValue(CmpSwap, 1));
  Results.push_back(SDValue(CmpSwap, 3));
}
void AArch64TargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this");
  case ISD::BITCAST:
    ReplaceBITCASTResults(N, Results, DAG);
    return;
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
    return;

  case AArch64ISD::SADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
    return;
  case AArch64ISD::UADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
    return;
  case AArch64ISD::SMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
    return;
  case AArch64ISD::UMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
    return;
  case AArch64ISD::SMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
    return;
  case AArch64ISD::UMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
    return;
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:
    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
    // Let normal code take care of it by not adding anything to Results.
    return;
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
    return;
  }
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
    return TargetLowering::useLoadStackGuardNode();
  return true;
}

unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are three or more FDIVs.
  return 3;
}

TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
  MVT SVT = VT.getSimpleVT();
  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
  // v4i16, v2i32 instead of promoting them.
  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
      || SVT == MVT::v1f32)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return Size == 128;
}

// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldxr/stxr up to 128 bits.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  if (Size > 128) return AtomicExpansionKind::None;
  // Nand is not supported in LSE.
  if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
  // Leave 128 bits to LLSC.
  return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None
                                             : AtomicExpansionKind::LLSC;
}

bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
    AtomicCmpXchgInst *AI) const {
  // If the subtarget has LSE, leave cmpxchg intact for codegen.
  if (Subtarget->hasLSE()) return false;
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  return getTargetMachine().getOptLevel() != 0;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                             AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i64, i64} and we have to recombine them into a
  // single i128 here.
  if (ValTy->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
    Function *Ldxr = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int =
      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldxr, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}
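// Illustrative only: for a 128-bit acquire load, the builder above emits IR of
// roughly the form
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
// and then zero-extends and shifts the two halves back into a single i128.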
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
}

Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                   Value *Val, Value *Addr,
                                                   AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i128 intrinsics take two
  // parameters: "i64, i64". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
    Function *Stxr = Intrinsic::getDeclaration(M, Int);
    Type *Int64Ty = Type::getInt64Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int =
      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
  Type *Tys[] = { Addr->getType() };
  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(Stxr,
                            {Builder.CreateZExtOrBitCast(
                                 Val, Stxr->getFunctionType()->getParamType(0)),
                             Addr});
}

bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  return Ty->isArrayTy();
}

bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
                                                            EVT) const {
  return false;
}
*UseTlsOffset(IRBuilder
<> &IRB
, unsigned Offset
) {
11044 Module
*M
= IRB
.GetInsertBlock()->getParent()->getParent();
11045 Function
*ThreadPointerFunc
=
11046 Intrinsic::getDeclaration(M
, Intrinsic::thread_pointer
);
11047 return IRB
.CreatePointerCast(
11048 IRB
.CreateConstGEP1_32(IRB
.CreateCall(ThreadPointerFunc
), Offset
),
11049 Type::getInt8PtrTy(IRB
.getContext())->getPointerTo(0));
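// Note: the helper above expands to a call to @llvm.thread.pointer followed
// by a byte-offset GEP and a pointer cast, i.e. effectively a fixed slot
// relative to the thread pointer (TPIDR_EL0); the callers below pass the
// platform-specific slot offsets.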
Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // Android provides a fixed TLS slot for the stack cookie. See the definition
  // of TLS_SLOT_STACK_GUARD in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x28);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x10);

  return TargetLowering::getIRStackGuard(IRB);
}

Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x48);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x8);

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  // Only sink the 'and' mask to the cmp use block if it is masking a single
  // bit, since this is likely to fold the and/cmp/br into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would
  // have to check that the cmp would not get folded into the br to form a
  // cbz for those to be worthwhile.
  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask)
    return false;
  return Mask->getValue().isPowerOf2();
}

void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  AFI->setIsSplitCSR(true);
}

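// For a split-CSR convention (CXX_FAST_TLS), copy each callee-saved register
// into a fresh virtual register in the entry block and copy it back right
// before every exit block's terminator.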
void AArch64TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AArch64::GPR64RegClass.contains(*I))
      RC = &AArch64::GPR64RegClass;
    else if (AArch64::FPR64RegClass.contains(*I))
      RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create a copy from the CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions; that is
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should
    // be nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on AArch64 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since AArch64 doesn't have
  // vector integer division, leaving the division as-is is a loss even in
  // terms of size, because it will have to be scalarized, while the
  // alternative code sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}

bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}

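// On Darwin and Windows va_list is a plain pointer. Under AAPCS64 it is a
// five-field struct (the __stack, __gr_top and __vr_top pointers plus the
// 32-bit __gr_offs and __vr_offs), which the size computed below reflects.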
unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
    return getPointerTy(DL).getSizeInBits();

  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}

void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);