// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
// inefficiency on some targets.
//
// For example:
//    fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
//    dup  v3.4s, v2.s[1]
//    fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
//
// For example:
//    st2 {v0.4s, v1.4s}, addr
//
// Is rewritten into:
//    zip1 v2.4s, v0.4s, v1.4s
//    zip2 v3.4s, v0.4s, v1.4s
//    stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <unordered_map>
using namespace llvm;

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
  "AArch64 SIMD instructions optimization pass"

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;
  // The two maps below are used to cache decisions instead of recomputing:
  // This is used to cache instruction replacement decisions within function
  // units and across function units.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // This is used to cache the decision of whether to leave the interleaved
  // store instructions replacement pass early or not for a particular target.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  typedef enum {
    VectorElem,
    Interleave
  } Subpass;
  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
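
  // Each rule below expands (via the macros above) to an InstReplInfo
  // aggregate of the form {OrigOpc, {ReplOpc...}, RC}; e.g. the first ST2
  // rule maps AArch64::ST2Twov2d to the sequence ZIP1v2i64, ZIP2v2i64, STPQi
  // operating on FPR128 registers.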
  // The Instruction Replacement Table:
  std::vector<InstReplInfo> IRT = {
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
  // A costly instruction is replaced in this work by N efficient instructions.
  // The maximum of N is currently 10 and it is for the ST4 case.
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
    initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
  }
  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the instructions stored in the
  /// array InstDescRepl.
  /// Return true if replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc *> &ReplInstrMCID);
  /// Determine if we need to exit the instruction replacement optimization
  /// passes early. This makes sure that no compile time is spent in this pass
  /// for targets with no need for any of these optimizations.
  /// Return true if early exit of the pass is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the DUP instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;
  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process the REG_SEQUENCE instruction and extract the source
  /// operands of the ST2/4 instruction from it.
  /// Example of such an instruction:
  ///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         unsigned *StRegKill, unsigned NumArg) const;

  /// Load/Store Interleaving instructions are not always beneficial.
  /// Replace them by ZIP instructions and classical load/store.
  /// Return true if the SIMD instruction is modified.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for this
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;
  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the instructions stored in the
/// array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // Check if the replacement decision is already available in the cached
  // table; if so, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If a target does not define resources for the instructions
  // of interest, then return false for no replacement.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (auto IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }
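
  // The profitability heuristic below sums the latencies of the candidate
  // replacement instructions and rewrites only when the original instruction
  // is strictly slower than that sum; the verdict is cached per
  // (opcode, subtarget CPU) pair in SIMDInstrTable.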
  unsigned ReplCost = 0;
  for (auto IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}

/// Determine if we need to exit this pass for a kind of instruction
/// replacement early. This makes sure that no compile time is spent in this
/// pass for targets with no need for any of these optimizations beyond
/// performing this check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For this optimization, check by comparing the latency of a representative
  // instruction to that of the replacement instructions.
  // TODO: check for all concerned instructions.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For this optimization, check for all concerned instructions.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}

/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}

/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
/// and FMULX and hence they are hardcoded.
///
/// For example:
///    fmla v0.4s, v1.4s, v2.s[1]
///
/// Is rewritten into:
///    dup  v3.4s, v2.s[1]          // DUP not necessary if redundant
///    fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
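
  // Before rewriting, ask the latency model whether the DUP plus full-vector
  // multiply/accumulate pair is actually cheaper than the indexed form on
  // this subtarget.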
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;
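
  // The indexed FMLA/FMLS forms carry a tied accumulator operand and thus
  // have five operands, while FMUL/FMULX have four. In both forms the
  // vector-element register and its lane number are the last two operands,
  // which is what the two branches below rely on.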
  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction. Note that if an equivalent DUP instruction
    // has already been created before, then use that one instead of creating
    // a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  }

  ++NumModifiedInstr;
  return true;
}

/// Load/Store Interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical load/store.
///
/// For example:
///    st2 {v0.4s, v1.4s}, addr
///
/// Is rewritten into:
///    zip1 v2.4s, v0.4s, v1.4s
///    zip2 v3.4s, v0.4s, v1.4s
///    stp  q2, q3, addr
///
/// For example:
///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// Is rewritten into:
///    zip1 v4.4s, v0.4s, v2.4s
///    zip2 v5.4s, v0.4s, v2.4s
///    zip1 v6.4s, v1.4s, v3.4s
///    zip2 v7.4s, v1.4s, v3.4s
///    zip1 v8.4s, v4.4s, v6.4s
///    zip2 v9.4s, v4.4s, v6.4s
///    zip1 v10.4s, v5.4s, v7.4s
///    zip2 v11.4s, v5.4s, v7.4s
///    stp  q8, q9, addr
///    stp  q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;
  // If the current instruction matches any of the rewriting rules, then
  // gather information about the parameters of the new instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg  = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Generate destination registers but only for non-store instruction.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;
  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
  // this point, the code generation is hardcoded and does not rely on the IRT
  // table used above, given that code generation for the ST2 replacement is
  // somewhat different from that for the ST4 replacement. We could have added
  // more info into the table related to how we build new instructions, but we
  // may be adding more complexity with that).
  switch (MI.getOpcode()) {
  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
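    // The first four ZIPs interleave the ST4 sources pairwise, the next four
    // interleave those intermediate results (the v4..v11 sequence in the
    // comment above), and the two STPs store the final registers to addr and
    // addr+32 (addr+16 for the 64-bit variants).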
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1])
        .addReg(StReg[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}

/// Process the REG_SEQUENCE instruction and extract the source
/// operands of the ST2/4 instruction from it.
/// Example of such an instruction:
///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());

    // Validation check for the other arguments.
    if (DefiningMI->getOperand(2*i+2).isImm()) {
      switch (DefiningMI->getOperand(2*i+2).getImm()) {
      default:
        return false;
      case AArch64::dsub0:
      case AArch64::dsub1:
      case AArch64::dsub2:
      case AArch64::dsub3:
      case AArch64::qsub0:
      case AArch64::qsub1:
      case AArch64::qsub2:
      case AArch64::qsub3:
        break;
      }
    } else
      return false;
  }
  return true;
}

/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}

bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
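  // Run each subpass only when shouldExitEarly says the target can benefit
  // from it, so no per-instruction work is done on subtargets where the
  // rewrites never pay off.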
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}