2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //===----------------------------------------------------------------------===//
8 // This file contains a pass that performs optimization on SIMD instructions
9 // with high latency by splitting them into more efficient series of
12 // 1. Rewrite certain SIMD instructions with vector element due to their
13 // inefficiency on some targets.
16 // fmla v0.4s, v1.4s, v2.s[1]
20 // fmla v0.4s, v1.4s, v3.4s
22 // 2. Rewrite interleaved memory access instructions due to their
23 // inefficiency on some targets.
26 // st2 {v0.4s, v1.4s}, addr
29 // zip1 v2.4s, v0.4s, v1.4s
30 // zip2 v3.4s, v0.4s, v1.4s
33 //===----------------------------------------------------------------------===//
35 #include "AArch64InstrInfo.h"
36 #include "llvm/ADT/SmallVector.h"
37 #include "llvm/ADT/Statistic.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/CodeGen/MachineBasicBlock.h"
40 #include "llvm/CodeGen/MachineFunction.h"
41 #include "llvm/CodeGen/MachineFunctionPass.h"
42 #include "llvm/CodeGen/MachineInstr.h"
43 #include "llvm/CodeGen/MachineInstrBuilder.h"
44 #include "llvm/CodeGen/MachineOperand.h"
45 #include "llvm/CodeGen/MachineRegisterInfo.h"
46 #include "llvm/CodeGen/TargetInstrInfo.h"
47 #include "llvm/CodeGen/TargetSchedule.h"
48 #include "llvm/CodeGen/TargetSubtargetInfo.h"
49 #include "llvm/MC/MCInstrDesc.h"
50 #include "llvm/MC/MCSchedule.h"
51 #include "llvm/Pass.h"
52 #include <unordered_map>
56 #define DEBUG_TYPE "aarch64-simdinstr-opt"
58 STATISTIC(NumModifiedInstr
,
59 "Number of SIMD instructions modified");
61 #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
62 "AArch64 SIMD instructions optimization pass"
66 struct AArch64SIMDInstrOpt
: public MachineFunctionPass
{
69 const TargetInstrInfo
*TII
;
70 MachineRegisterInfo
*MRI
;
71 TargetSchedModel SchedModel
;
73 // The two maps below are used to cache decisions instead of recomputing:
74 // This is used to cache instruction replacement decisions within function
75 // units and across function units.
76 std::map
<std::pair
<unsigned, std::string
>, bool> SIMDInstrTable
;
77 // This is used to cache the decision of whether to leave the interleaved
78 // store instructions replacement pass early or not for a particular target.
79 std::unordered_map
<std::string
, bool> InterlEarlyExit
;
86 // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
89 std::vector
<unsigned> ReplOpc
;
90 const TargetRegisterClass RC
;
93 #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
94 {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
95 #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
96 OpcR7, OpcR8, OpcR9, RC) \
98 {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
100 // The Instruction Replacement Table:
101 std::vector
<InstReplInfo
> IRT
= {
103 RuleST2(AArch64::ST2Twov2d
, AArch64::ZIP1v2i64
, AArch64::ZIP2v2i64
,
104 AArch64::STPQi
, AArch64::FPR128RegClass
),
105 RuleST2(AArch64::ST2Twov4s
, AArch64::ZIP1v4i32
, AArch64::ZIP2v4i32
,
106 AArch64::STPQi
, AArch64::FPR128RegClass
),
107 RuleST2(AArch64::ST2Twov2s
, AArch64::ZIP1v2i32
, AArch64::ZIP2v2i32
,
108 AArch64::STPDi
, AArch64::FPR64RegClass
),
109 RuleST2(AArch64::ST2Twov8h
, AArch64::ZIP1v8i16
, AArch64::ZIP2v8i16
,
110 AArch64::STPQi
, AArch64::FPR128RegClass
),
111 RuleST2(AArch64::ST2Twov4h
, AArch64::ZIP1v4i16
, AArch64::ZIP2v4i16
,
112 AArch64::STPDi
, AArch64::FPR64RegClass
),
113 RuleST2(AArch64::ST2Twov16b
, AArch64::ZIP1v16i8
, AArch64::ZIP2v16i8
,
114 AArch64::STPQi
, AArch64::FPR128RegClass
),
115 RuleST2(AArch64::ST2Twov8b
, AArch64::ZIP1v8i8
, AArch64::ZIP2v8i8
,
116 AArch64::STPDi
, AArch64::FPR64RegClass
),
118 RuleST4(AArch64::ST4Fourv2d
, AArch64::ZIP1v2i64
, AArch64::ZIP2v2i64
,
119 AArch64::ZIP1v2i64
, AArch64::ZIP2v2i64
, AArch64::ZIP1v2i64
,
120 AArch64::ZIP2v2i64
, AArch64::ZIP1v2i64
, AArch64::ZIP2v2i64
,
121 AArch64::STPQi
, AArch64::STPQi
, AArch64::FPR128RegClass
),
122 RuleST4(AArch64::ST4Fourv4s
, AArch64::ZIP1v4i32
, AArch64::ZIP2v4i32
,
123 AArch64::ZIP1v4i32
, AArch64::ZIP2v4i32
, AArch64::ZIP1v4i32
,
124 AArch64::ZIP2v4i32
, AArch64::ZIP1v4i32
, AArch64::ZIP2v4i32
,
125 AArch64::STPQi
, AArch64::STPQi
, AArch64::FPR128RegClass
),
126 RuleST4(AArch64::ST4Fourv2s
, AArch64::ZIP1v2i32
, AArch64::ZIP2v2i32
,
127 AArch64::ZIP1v2i32
, AArch64::ZIP2v2i32
, AArch64::ZIP1v2i32
,
128 AArch64::ZIP2v2i32
, AArch64::ZIP1v2i32
, AArch64::ZIP2v2i32
,
129 AArch64::STPDi
, AArch64::STPDi
, AArch64::FPR64RegClass
),
130 RuleST4(AArch64::ST4Fourv8h
, AArch64::ZIP1v8i16
, AArch64::ZIP2v8i16
,
131 AArch64::ZIP1v8i16
, AArch64::ZIP2v8i16
, AArch64::ZIP1v8i16
,
132 AArch64::ZIP2v8i16
, AArch64::ZIP1v8i16
, AArch64::ZIP2v8i16
,
133 AArch64::STPQi
, AArch64::STPQi
, AArch64::FPR128RegClass
),
134 RuleST4(AArch64::ST4Fourv4h
, AArch64::ZIP1v4i16
, AArch64::ZIP2v4i16
,
135 AArch64::ZIP1v4i16
, AArch64::ZIP2v4i16
, AArch64::ZIP1v4i16
,
136 AArch64::ZIP2v4i16
, AArch64::ZIP1v4i16
, AArch64::ZIP2v4i16
,
137 AArch64::STPDi
, AArch64::STPDi
, AArch64::FPR64RegClass
),
138 RuleST4(AArch64::ST4Fourv16b
, AArch64::ZIP1v16i8
, AArch64::ZIP2v16i8
,
139 AArch64::ZIP1v16i8
, AArch64::ZIP2v16i8
, AArch64::ZIP1v16i8
,
140 AArch64::ZIP2v16i8
, AArch64::ZIP1v16i8
, AArch64::ZIP2v16i8
,
141 AArch64::STPQi
, AArch64::STPQi
, AArch64::FPR128RegClass
),
142 RuleST4(AArch64::ST4Fourv8b
, AArch64::ZIP1v8i8
, AArch64::ZIP2v8i8
,
143 AArch64::ZIP1v8i8
, AArch64::ZIP2v8i8
, AArch64::ZIP1v8i8
,
144 AArch64::ZIP2v8i8
, AArch64::ZIP1v8i8
, AArch64::ZIP2v8i8
,
145 AArch64::STPDi
, AArch64::STPDi
, AArch64::FPR64RegClass
)
148 // A costly instruction is replaced in this work by N efficient instructions.
149 // The maximum of N is currently 10, and it is for the ST4 case.
150 static const unsigned MaxNumRepl
= 10;
152 AArch64SIMDInstrOpt() : MachineFunctionPass(ID
) {
153 initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
156 /// Based only on latency of instructions, determine if it is cost efficient
157 /// to replace the instruction InstDesc by the instructions stored in the
158 /// array InstDescRepl.
159 /// Return true if replacement is expected to be faster.
160 bool shouldReplaceInst(MachineFunction
*MF
, const MCInstrDesc
*InstDesc
,
161 SmallVectorImpl
<const MCInstrDesc
*> &ReplInstrMCID
);
163 /// Determine if we need to exit the instruction replacement optimization
164 /// passes early. This makes sure that no compile time is spent in this pass
165 /// for targets with no need for any of these optimizations.
166 /// Return true if early exit of the pass is recommended.
167 bool shouldExitEarly(MachineFunction
*MF
, Subpass SP
);
169 /// Check whether an equivalent DUP instruction has already been
171 /// Return true when the DUP instruction already exists. In this case,
172 /// DestReg will point to the destination of the already created DUP.
173 bool reuseDUP(MachineInstr
&MI
, unsigned DupOpcode
, unsigned SrcReg
,
174 unsigned LaneNumber
, unsigned *DestReg
) const;
176 /// Certain SIMD instructions with vector element operand are not efficient.
177 /// Rewrite them into SIMD instructions with vector operands. This rewrite
178 /// is driven by the latency of the instructions.
179 /// Return true if the SIMD instruction is modified.
180 bool optimizeVectElement(MachineInstr
&MI
);
182 /// Process the REG_SEQUENCE instruction and extract the source
183 /// operands of the ST2/4 instruction from it.
184 /// Example of such an instruction:
185 /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
186 /// Return true when the instruction is processed successfully.
187 bool processSeqRegInst(MachineInstr
*DefiningMI
, unsigned* StReg
,
188 unsigned* StRegKill
, unsigned NumArg
) const;
190 /// Load/Store Interleaving instructions are not always beneficial.
191 /// Replace them by ZIP instructions and classical load/store.
192 /// Return true if the SIMD instruction is modified.
193 bool optimizeLdStInterleave(MachineInstr
&MI
);
195 /// Return the number of useful source registers for this
196 /// instruction (2 for ST2 and 4 for ST4).
197 unsigned determineSrcReg(MachineInstr
&MI
) const;
199 bool runOnMachineFunction(MachineFunction
&Fn
) override
;
201 StringRef
getPassName() const override
{
202 return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME
;
206 char AArch64SIMDInstrOpt::ID
= 0;
208 } // end anonymous namespace
210 INITIALIZE_PASS(AArch64SIMDInstrOpt
, "aarch64-simdinstr-opt",
211 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME
, false, false)
213 /// Based only on latency of instructions, determine if it is cost efficient
214 /// to replace the instruction InstDesc by the instructions stored in the
215 /// array InstDescRepl.
216 /// Return true if replacement is expected to be faster.
217 bool AArch64SIMDInstrOpt::
218 shouldReplaceInst(MachineFunction
*MF
, const MCInstrDesc
*InstDesc
,
219 SmallVectorImpl
<const MCInstrDesc
*> &InstDescRepl
) {
220 // Check if replacement decision is already available in the cached table.
222 std::string Subtarget
= SchedModel
.getSubtargetInfo()->getCPU();
223 auto InstID
= std::make_pair(InstDesc
->getOpcode(), Subtarget
);
224 if (SIMDInstrTable
.find(InstID
) != SIMDInstrTable
.end())
225 return SIMDInstrTable
[InstID
];
227 unsigned SCIdx
= InstDesc
->getSchedClass();
228 const MCSchedClassDesc
*SCDesc
=
229 SchedModel
.getMCSchedModel()->getSchedClassDesc(SCIdx
);
231 // If a target does not define resources for the instructions
232 // of interest, then return false for no replacement.
233 const MCSchedClassDesc
*SCDescRepl
;
234 if (!SCDesc
->isValid() || SCDesc
->isVariant())
236 SIMDInstrTable
[InstID
] = false;
239 for (auto IDesc
: InstDescRepl
)
241 SCDescRepl
= SchedModel
.getMCSchedModel()->getSchedClassDesc(
242 IDesc
->getSchedClass());
243 if (!SCDescRepl
->isValid() || SCDescRepl
->isVariant())
245 SIMDInstrTable
[InstID
] = false;
251 unsigned ReplCost
= 0;
252 for (auto IDesc
:InstDescRepl
)
253 ReplCost
+= SchedModel
.computeInstrLatency(IDesc
->getOpcode());
255 if (SchedModel
.computeInstrLatency(InstDesc
->getOpcode()) > ReplCost
)
257 SIMDInstrTable
[InstID
] = true;
262 SIMDInstrTable
[InstID
] = false;
267 /// Determine if we need to exit this pass for a kind of instruction replacement
268 /// early. This makes sure that no compile time is spent in this pass for
269 /// targets with no need for any of these optimizations beyond performing this
271 /// Return true if early exit of this pass for a kind of instruction
272 /// replacement is recommended for a target.
273 bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction
*MF
, Subpass SP
) {
274 const MCInstrDesc
* OriginalMCID
;
275 SmallVector
<const MCInstrDesc
*, MaxNumRepl
> ReplInstrMCID
;
278 // For this optimization, check by comparing the latency of a representative
279 // instruction to that of the replacement instructions.
280 // TODO: check for all concerned instructions.
282 OriginalMCID
= &TII
->get(AArch64::FMLAv4i32_indexed
);
283 ReplInstrMCID
.push_back(&TII
->get(AArch64::DUPv4i32lane
));
284 ReplInstrMCID
.push_back(&TII
->get(AArch64::FMLAv4f32
));
285 if (shouldReplaceInst(MF
, OriginalMCID
, ReplInstrMCID
))
289 // For this optimization, check for all concerned instructions.
291 std::string Subtarget
= SchedModel
.getSubtargetInfo()->getCPU();
292 if (InterlEarlyExit
.find(Subtarget
) != InterlEarlyExit
.end())
293 return InterlEarlyExit
[Subtarget
];
295 for (auto &I
: IRT
) {
296 OriginalMCID
= &TII
->get(I
.OrigOpc
);
297 for (auto &Repl
: I
.ReplOpc
)
298 ReplInstrMCID
.push_back(&TII
->get(Repl
));
299 if (shouldReplaceInst(MF
, OriginalMCID
, ReplInstrMCID
)) {
300 InterlEarlyExit
[Subtarget
] = false;
303 ReplInstrMCID
.clear();
305 InterlEarlyExit
[Subtarget
] = true;
312 /// Check whether an equivalent DUP instruction has already been
314 /// Return true when the DUP instruction already exists. In this case,
315 /// DestReg will point to the destination of the already created DUP.
316 bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr
&MI
, unsigned DupOpcode
,
317 unsigned SrcReg
, unsigned LaneNumber
,
318 unsigned *DestReg
) const {
319 for (MachineBasicBlock::iterator MII
= MI
, MIE
= MI
.getParent()->begin();
322 MachineInstr
*CurrentMI
= &*MII
;
324 if (CurrentMI
->getOpcode() == DupOpcode
&&
325 CurrentMI
->getNumOperands() == 3 &&
326 CurrentMI
->getOperand(1).getReg() == SrcReg
&&
327 CurrentMI
->getOperand(2).getImm() == LaneNumber
) {
328 *DestReg
= CurrentMI
->getOperand(0).getReg();
336 /// Certain SIMD instructions with vector element operand are not efficient.
337 /// Rewrite them into SIMD instructions with vector operands. This rewrite
338 /// is driven by the latency of the instructions.
339 /// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
340 /// and FMULX, and hence they are hardcoded.
343 /// fmla v0.4s, v1.4s, v2.s[1]
345 /// Is rewritten into
346 /// dup v3.4s, v2.s[1] // DUP not necessary if redundant
347 /// fmla v0.4s, v1.4s, v3.4s
349 /// Return true if the SIMD instruction is modified.
350 bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr
&MI
) {
351 const MCInstrDesc
*MulMCID
, *DupMCID
;
352 const TargetRegisterClass
*RC
= &AArch64::FPR128RegClass
;
354 switch (MI
.getOpcode()) {
359 case AArch64::FMLAv4i32_indexed
:
360 DupMCID
= &TII
->get(AArch64::DUPv4i32lane
);
361 MulMCID
= &TII
->get(AArch64::FMLAv4f32
);
363 case AArch64::FMLSv4i32_indexed
:
364 DupMCID
= &TII
->get(AArch64::DUPv4i32lane
);
365 MulMCID
= &TII
->get(AArch64::FMLSv4f32
);
367 case AArch64::FMULXv4i32_indexed
:
368 DupMCID
= &TII
->get(AArch64::DUPv4i32lane
);
369 MulMCID
= &TII
->get(AArch64::FMULXv4f32
);
371 case AArch64::FMULv4i32_indexed
:
372 DupMCID
= &TII
->get(AArch64::DUPv4i32lane
);
373 MulMCID
= &TII
->get(AArch64::FMULv4f32
);
377 case AArch64::FMLAv2i64_indexed
:
378 DupMCID
= &TII
->get(AArch64::DUPv2i64lane
);
379 MulMCID
= &TII
->get(AArch64::FMLAv2f64
);
381 case AArch64::FMLSv2i64_indexed
:
382 DupMCID
= &TII
->get(AArch64::DUPv2i64lane
);
383 MulMCID
= &TII
->get(AArch64::FMLSv2f64
);
385 case AArch64::FMULXv2i64_indexed
:
386 DupMCID
= &TII
->get(AArch64::DUPv2i64lane
);
387 MulMCID
= &TII
->get(AArch64::FMULXv2f64
);
389 case AArch64::FMULv2i64_indexed
:
390 DupMCID
= &TII
->get(AArch64::DUPv2i64lane
);
391 MulMCID
= &TII
->get(AArch64::FMULv2f64
);
395 case AArch64::FMLAv2i32_indexed
:
396 RC
= &AArch64::FPR64RegClass
;
397 DupMCID
= &TII
->get(AArch64::DUPv2i32lane
);
398 MulMCID
= &TII
->get(AArch64::FMLAv2f32
);
400 case AArch64::FMLSv2i32_indexed
:
401 RC
= &AArch64::FPR64RegClass
;
402 DupMCID
= &TII
->get(AArch64::DUPv2i32lane
);
403 MulMCID
= &TII
->get(AArch64::FMLSv2f32
);
405 case AArch64::FMULXv2i32_indexed
:
406 RC
= &AArch64::FPR64RegClass
;
407 DupMCID
= &TII
->get(AArch64::DUPv2i32lane
);
408 MulMCID
= &TII
->get(AArch64::FMULXv2f32
);
410 case AArch64::FMULv2i32_indexed
:
411 RC
= &AArch64::FPR64RegClass
;
412 DupMCID
= &TII
->get(AArch64::DUPv2i32lane
);
413 MulMCID
= &TII
->get(AArch64::FMULv2f32
);
417 SmallVector
<const MCInstrDesc
*, 2> ReplInstrMCID
;
418 ReplInstrMCID
.push_back(DupMCID
);
419 ReplInstrMCID
.push_back(MulMCID
);
420 if (!shouldReplaceInst(MI
.getParent()->getParent(), &TII
->get(MI
.getOpcode()),
424 const DebugLoc
&DL
= MI
.getDebugLoc();
425 MachineBasicBlock
&MBB
= *MI
.getParent();
426 MachineRegisterInfo
&MRI
= MBB
.getParent()->getRegInfo();
428 // Get the operands of the current SIMD arithmetic instruction.
429 unsigned MulDest
= MI
.getOperand(0).getReg();
430 unsigned SrcReg0
= MI
.getOperand(1).getReg();
431 unsigned Src0IsKill
= getKillRegState(MI
.getOperand(1).isKill());
432 unsigned SrcReg1
= MI
.getOperand(2).getReg();
433 unsigned Src1IsKill
= getKillRegState(MI
.getOperand(2).isKill());
436 // Instructions of interest have either 4 or 5 operands.
437 if (MI
.getNumOperands() == 5) {
438 unsigned SrcReg2
= MI
.getOperand(3).getReg();
439 unsigned Src2IsKill
= getKillRegState(MI
.getOperand(3).isKill());
440 unsigned LaneNumber
= MI
.getOperand(4).getImm();
441 // Create a new DUP instruction. Note that if an equivalent DUP instruction
442 // has already been created before, then use that one instead of creating
444 if (!reuseDUP(MI
, DupMCID
->getOpcode(), SrcReg2
, LaneNumber
, &DupDest
)) {
445 DupDest
= MRI
.createVirtualRegister(RC
);
446 BuildMI(MBB
, MI
, DL
, *DupMCID
, DupDest
)
447 .addReg(SrcReg2
, Src2IsKill
)
450 BuildMI(MBB
, MI
, DL
, *MulMCID
, MulDest
)
451 .addReg(SrcReg0
, Src0IsKill
)
452 .addReg(SrcReg1
, Src1IsKill
)
453 .addReg(DupDest
, Src2IsKill
);
454 } else if (MI
.getNumOperands() == 4) {
455 unsigned LaneNumber
= MI
.getOperand(3).getImm();
456 if (!reuseDUP(MI
, DupMCID
->getOpcode(), SrcReg1
, LaneNumber
, &DupDest
)) {
457 DupDest
= MRI
.createVirtualRegister(RC
);
458 BuildMI(MBB
, MI
, DL
, *DupMCID
, DupDest
)
459 .addReg(SrcReg1
, Src1IsKill
)
462 BuildMI(MBB
, MI
, DL
, *MulMCID
, MulDest
)
463 .addReg(SrcReg0
, Src0IsKill
)
464 .addReg(DupDest
, Src1IsKill
);
473 /// Load/Store Interleaving instructions are not always beneficial.
474 /// Replace them by ZIP instructions and classical load/store.
477 /// st2 {v0.4s, v1.4s}, addr
479 /// Is rewritten into:
480 /// zip1 v2.4s, v0.4s, v1.4s
481 /// zip2 v3.4s, v0.4s, v1.4s
485 /// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
487 /// Is rewritten into:
488 /// zip1 v4.4s, v0.4s, v2.4s
489 /// zip2 v5.4s, v0.4s, v2.4s
490 /// zip1 v6.4s, v1.4s, v3.4s
491 /// zip2 v7.4s, v1.4s, v3.4s
492 /// zip1 v8.4s, v4.4s, v6.4s
493 /// zip2 v9.4s, v4.4s, v6.4s
494 /// zip1 v10.4s, v5.4s, v7.4s
495 /// zip2 v11.4s, v5.4s, v7.4s
497 /// stp q10, q11, addr+32
499 /// Currently only instructions related to ST2 and ST4 are considered.
500 /// Others may be added later.
501 /// Return true if the SIMD instruction is modified.
502 bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr
&MI
) {
504 unsigned SeqReg
, AddrReg
;
505 unsigned StReg
[4], StRegKill
[4];
506 MachineInstr
*DefiningMI
;
507 const DebugLoc
&DL
= MI
.getDebugLoc();
508 MachineBasicBlock
&MBB
= *MI
.getParent();
509 SmallVector
<unsigned, MaxNumRepl
> ZipDest
;
510 SmallVector
<const MCInstrDesc
*, MaxNumRepl
> ReplInstrMCID
;
512 // If current instruction matches any of the rewriting rules, then
513 // gather information about parameters of the new instructions.
515 for (auto &I
: IRT
) {
516 if (MI
.getOpcode() == I
.OrigOpc
) {
517 SeqReg
= MI
.getOperand(0).getReg();
518 AddrReg
= MI
.getOperand(1).getReg();
519 DefiningMI
= MRI
->getUniqueVRegDef(SeqReg
);
520 unsigned NumReg
= determineSrcReg(MI
);
521 if (!processSeqRegInst(DefiningMI
, StReg
, StRegKill
, NumReg
))
524 for (auto &Repl
: I
.ReplOpc
) {
525 ReplInstrMCID
.push_back(&TII
->get(Repl
));
526 // Generate destination registers but only for non-store instruction.
527 if (Repl
!= AArch64::STPQi
&& Repl
!= AArch64::STPDi
)
528 ZipDest
.push_back(MRI
->createVirtualRegister(&I
.RC
));
538 // Determine if it is profitable to replace MI by the series of instructions
539 // represented in ReplInstrMCID.
540 if (!shouldReplaceInst(MI
.getParent()->getParent(), &TII
->get(MI
.getOpcode()),
544 // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
545 // this point, the code generation is hardcoded and does not rely on the IRT
546 // table used above given that code generation for ST2 replacement is somewhat
547 // different than for ST4 replacement. We could have added more info into the
548 // table related to how we build new instructions but we may be adding more
549 // complexity with that).
550 switch (MI
.getOpcode()) {
554 case AArch64::ST2Twov16b
:
555 case AArch64::ST2Twov8b
:
556 case AArch64::ST2Twov8h
:
557 case AArch64::ST2Twov4h
:
558 case AArch64::ST2Twov4s
:
559 case AArch64::ST2Twov2s
:
560 case AArch64::ST2Twov2d
:
562 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[0], ZipDest
[0])
565 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[1], ZipDest
[1])
566 .addReg(StReg
[0], StRegKill
[0])
567 .addReg(StReg
[1], StRegKill
[1]);
569 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[2])
576 case AArch64::ST4Fourv16b
:
577 case AArch64::ST4Fourv8b
:
578 case AArch64::ST4Fourv8h
:
579 case AArch64::ST4Fourv4h
:
580 case AArch64::ST4Fourv4s
:
581 case AArch64::ST4Fourv2s
:
582 case AArch64::ST4Fourv2d
:
584 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[0], ZipDest
[0])
587 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[1], ZipDest
[1])
588 .addReg(StReg
[0], StRegKill
[0])
589 .addReg(StReg
[2], StRegKill
[2]);
590 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[2], ZipDest
[2])
593 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[3], ZipDest
[3])
594 .addReg(StReg
[1], StRegKill
[1])
595 .addReg(StReg
[3], StRegKill
[3]);
596 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[4], ZipDest
[4])
599 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[5], ZipDest
[5])
602 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[6], ZipDest
[6])
605 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[7], ZipDest
[7])
609 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[8])
614 BuildMI(MBB
, MI
, DL
, *ReplInstrMCID
[9])
626 /// Process the REG_SEQUENCE instruction and extract the source
627 /// operands of the ST2/4 instruction from it.
628 /// Example of such an instruction:
629 /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
630 /// Return true when the instruction is processed successfully.
631 bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr
*DefiningMI
,
632 unsigned* StReg
, unsigned* StRegKill
, unsigned NumArg
) const {
633 assert (DefiningMI
!= NULL
);
634 if (DefiningMI
->getOpcode() != AArch64::REG_SEQUENCE
)
637 for (unsigned i
=0; i
<NumArg
; i
++) {
638 StReg
[i
] = DefiningMI
->getOperand(2*i
+1).getReg();
639 StRegKill
[i
] = getKillRegState(DefiningMI
->getOperand(2*i
+1).isKill());
641 // Sanity check for the other arguments.
642 if (DefiningMI
->getOperand(2*i
+2).isImm()) {
643 switch (DefiningMI
->getOperand(2*i
+2).getImm()) {
664 /// Return the number of useful source registers for this instruction
665 /// (2 for ST2 and 4 for ST4).
666 unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr
&MI
) const {
667 switch (MI
.getOpcode()) {
669 llvm_unreachable("Unsupported instruction for this pass");
671 case AArch64::ST2Twov16b
:
672 case AArch64::ST2Twov8b
:
673 case AArch64::ST2Twov8h
:
674 case AArch64::ST2Twov4h
:
675 case AArch64::ST2Twov4s
:
676 case AArch64::ST2Twov2s
:
677 case AArch64::ST2Twov2d
:
680 case AArch64::ST4Fourv16b
:
681 case AArch64::ST4Fourv8b
:
682 case AArch64::ST4Fourv8h
:
683 case AArch64::ST4Fourv4h
:
684 case AArch64::ST4Fourv4s
:
685 case AArch64::ST4Fourv2s
:
686 case AArch64::ST4Fourv2d
:
691 bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction
&MF
) {
692 if (skipFunction(MF
.getFunction()))
695 TII
= MF
.getSubtarget().getInstrInfo();
696 MRI
= &MF
.getRegInfo();
697 const TargetSubtargetInfo
&ST
= MF
.getSubtarget();
698 const AArch64InstrInfo
*AAII
=
699 static_cast<const AArch64InstrInfo
*>(ST
.getInstrInfo());
702 SchedModel
.init(&ST
);
703 if (!SchedModel
.hasInstrSchedModel())
706 bool Changed
= false;
707 for (auto OptimizationKind
: {VectorElem
, Interleave
}) {
708 if (!shouldExitEarly(&MF
, OptimizationKind
)) {
709 SmallVector
<MachineInstr
*, 8> RemoveMIs
;
710 for (MachineBasicBlock
&MBB
: MF
) {
711 for (MachineBasicBlock::iterator MII
= MBB
.begin(), MIE
= MBB
.end();
713 MachineInstr
&MI
= *MII
;
715 if (OptimizationKind
== VectorElem
)
716 InstRewrite
= optimizeVectElement(MI
) ;
718 InstRewrite
= optimizeLdStInterleave(MI
);
720 // Add MI to the list of instructions to be removed given that it
721 // has been replaced.
722 RemoveMIs
.push_back(&MI
);
728 for (MachineInstr
*MI
: RemoveMIs
)
729 MI
->eraseFromParent();
736 /// Returns an instance of the high cost ASIMD instruction replacement
737 /// optimization pass.
738 FunctionPass
*llvm::createAArch64SIMDInstrOptPass() {
739 return new AArch64SIMDInstrOpt();