//===- AArch64SIMDInstrOpt.cpp - AArch64 SIMD instructions optimization --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
// inefficiency on some targets.
//
// For example:
//    fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
//    dup  v3.4s, v2.s[1]
//    fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
//
// For example:
//    st2 {v0.4s, v1.4s}, addr
//
// Is rewritten into:
//    zip1 v2.4s, v0.4s, v1.4s
//    zip2 v3.4s, v0.4s, v1.4s
//    stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  // The two maps below are used to cache decisions instead of recomputing:
  // This map caches instruction replacement decisions within a function and
  // across functions.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // This map caches, per target, the decision of whether to exit the
  // interleaved-store instruction replacement subpass early.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  typedef enum {
    VectorElem,
    Interleave
  } Subpass; // Subpasses are numbered following this enum.

  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };

#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC)                               \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6,       \
                OpcR7, OpcR8, OpcR9, RC)                                       \
  {OpcOrg,                                                                     \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
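
// For illustration, the first ST2 rule in the table below,
//   RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
//           AArch64::STPQi, AArch64::FPR128RegClass)
// expands to the initializer
//   {AArch64::ST2Twov2d,
//    {AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::STPQi},
//    AArch64::FPR128RegClass}
// i.e. one interleaved store mapped to two ZIPs followed by one store pair.
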
  // The Instruction Replacement Table:
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };

  // A costly instruction is replaced in this work by N efficient instructions.
  // The maximum of N is currently 10 and it is for the ST4 case.
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
    initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
  }

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the instructions stored in the
  /// array InstDescRepl.
  /// Return true if replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc *> &ReplInstrMCID);

  /// Determine if we need to exit the instruction replacement optimization
  /// passes early. This makes sure that no compile time is spent in this pass
  /// for targets with no need for any of these optimizations.
  /// Return true if early exit of the pass is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the DUP instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process the REG_SEQUENCE instruction, and extract the source
  /// operands of the ST2/ST4 instruction from it.
  /// Example of such an instruction:
  ///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         unsigned *StRegKill, unsigned NumArg) const;

  /// Load/Store Interleaving instructions are not always beneficial.
  /// Replace them by ZIP instructions and classical load/store.
  /// Return true if the SIMD instruction is modified.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for this
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

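// Usage sketch: being registered as a machine pass, this optimization can be
// exercised in isolation on MIR input (hypothetical invocation; the selected
// CPU must provide an instruction scheduling model or the pass exits early):
//   llc -mtriple=aarch64 -mcpu=<cpu> -run-pass=aarch64-simdinstr-opt in.mir
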
/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the instructions stored in the
/// array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // Check if the replacement decision is already available in the cached
  // table; if so, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If a target does not define resources for the instructions
  // of interest, then return false for no replacement.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost: the sum of the latencies of the replacement
  // instructions.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}

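// As a worked example of the comparison above, take hypothetical latencies
// (illustrative only, not taken from any real scheduling model) of 9 cycles
// for FMLAv4i32_indexed, 1 for DUPv4i32lane, and 5 for FMLAv4f32. Then
// ReplCost = 1 + 5 = 6 < 9, so the replacement is cached as profitable; with
// an indexed-FMLA latency of 6 or less it would be rejected.
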
/// Determine if we need to exit this pass for a kind of instruction
/// replacement early. This makes sure that no compile time is spent in this
/// pass for targets with no need for any of these optimizations beyond
/// performing this check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For this optimization, check by comparing the latency of a representative
  // instruction to that of the replacement instructions.
  // TODO: check for all concerned instructions.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For this optimization, check for all concerned instructions.
  case Interleave: {
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }
  }

  return true;
}

/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  // Scan backwards from MI to the beginning of the basic block, looking for
  // a DUP of the same source register and lane number.
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}

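// As an illustration of the backward scan above (virtual register numbers
// hypothetical): given earlier code in the block such as
//   %5 = DUPv4i32lane %2, 1
//   %6 = FMLAv4f32 %0, %1, %5
// a later rewrite that needs DUPv4i32lane of %2, lane 1 receives
// *DestReg = %5 and emits no second DUP.
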
/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
/// and FMULX, and hence they are hardcoded.
///
/// For example:
///    fmla v0.4s, v1.4s, v2.s[1]
///
/// Is rewritten into:
///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
///    fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction. Note that if an equivalent DUP instruction
    // has already been created before, then use that one instead of creating
    // a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  }

  ++NumModifiedInstr;
  return true;
}

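// Note on the 4- vs 5-operand distinction above: the accumulating forms
// (e.g. FMLAv4i32_indexed) carry a tied accumulator input in addition to the
// two multiplicands and the lane immediate, so they have 5 operands, while
// the plain multiplies (e.g. FMULv4i32_indexed) have 4.
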
/// Load/Store Interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical load/store.
///
/// For example:
///    st2 {v0.4s, v1.4s}, addr
///
/// Is rewritten into:
///    zip1 v2.4s, v0.4s, v1.4s
///    zip2 v3.4s, v0.4s, v1.4s
///    stp  q2, q3, addr
///
/// For example:
///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// Is rewritten into:
///    zip1 v4.4s, v0.4s, v2.4s
///    zip2 v5.4s, v0.4s, v2.4s
///    zip1 v6.4s, v1.4s, v3.4s
///    zip2 v7.4s, v1.4s, v3.4s
///    zip1 v8.4s, v4.4s, v6.4s
///    zip2 v9.4s, v4.4s, v6.4s
///    zip1 v10.4s, v5.4s, v7.4s
///    zip2 v11.4s, v5.4s, v7.4s
///    stp  q8, q9, addr
///    stp  q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {

  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // If current instruction matches any of the rewriting rules, then
  // gather information about parameters of the new instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Generate destination registers but only for non-store instruction.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }
  if (!Match)
    return false;

  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
  // this point, the code generation is hardcoded and does not rely on the IRT
  // table used above given that code generation for ST2 replacement is
  // somewhat different than for ST4 replacement. We could have added more
  // info into the table related to how we build new instructions but we may
  // be adding more complexity with that).
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instruction
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

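    // Note: the STP immediate is scaled by the size of a single register
    // (16 bytes for STPQi, 8 for STPDi), so addImm(0) above stores the first
    // pair at [addr] and, in the ST4 case below, addImm(2) stores the second
    // pair of Q registers at [addr + 32].
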
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1])
        .addReg(StReg[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}

/// Process the REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/ST4 instruction from it.
/// Example of such an instruction:
///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i] = DefiningMI->getOperand(2 * i + 1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2 * i + 1).isKill());

    // Validation check for the other arguments.
    if (DefiningMI->getOperand(2 * i + 2).isImm()) {
      switch (DefiningMI->getOperand(2 * i + 2).getImm()) {
      default:
        return false;
      case AArch64::dsub0:
      case AArch64::dsub1:
      case AArch64::dsub2:
      case AArch64::dsub3:
      case AArch64::qsub0:
      case AArch64::qsub1:
      case AArch64::qsub2:
      case AArch64::qsub3:
        break;
      }
    } else
      return false;
  }
  return true;
}

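// For an ST4 of Q registers, the defining REG_SEQUENCE handled above
// typically has the shape (virtual register numbers hypothetical):
//   %9 = REG_SEQUENCE %4, qsub0, %5, qsub1, %6, qsub2, %7, qsub3
// so operands 1, 3, 5, 7 carry the stored registers and operands 2, 4, 6, 8
// carry the subregister indices validated above.
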
/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}

bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}