1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
12 //===----------------------------------------------------------------------===//
14 #include "AArch64InstrInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "llvm/ADT/BitVector.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/iterator_range.h"
22 #include "llvm/Analysis/AliasAnalysis.h"
23 #include "llvm/CodeGen/MachineBasicBlock.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineFunctionPass.h"
26 #include "llvm/CodeGen/MachineInstr.h"
27 #include "llvm/CodeGen/MachineInstrBuilder.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/CodeGen/TargetRegisterInfo.h"
30 #include "llvm/IR/DebugLoc.h"
31 #include "llvm/MC/MCRegisterInfo.h"
32 #include "llvm/Pass.h"
33 #include "llvm/Support/CommandLine.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/raw_ostream.h"
44 #define DEBUG_TYPE "aarch64-ldst-opt"
46 STATISTIC(NumPairCreated
, "Number of load/store pair instructions generated");
47 STATISTIC(NumPostFolded
, "Number of post-index updates folded");
48 STATISTIC(NumPreFolded
, "Number of pre-index updates folded");
49 STATISTIC(NumUnscaledPairCreated
,
50 "Number of load/store from unscaled generated");
51 STATISTIC(NumZeroStoresPromoted
, "Number of narrow zero stores promoted");
52 STATISTIC(NumLoadsFromStoresPromoted
, "Number of loads from stores promoted");
54 // The LdStLimit limits how far we search for load/store pairs.
55 static cl::opt
<unsigned> LdStLimit("aarch64-load-store-scan-limit",
56 cl::init(20), cl::Hidden
);
58 // The UpdateLimit limits how far we search for update instructions when we form
59 // pre-/post-index instructions.
60 static cl::opt
<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
63 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
67 using LdStPairFlags
= struct LdStPairFlags
{
68 // If a matching instruction is found, MergeForward is set to true if the
69 // merge is to remove the first instruction and replace the second with
70 // a pair-wise insn, and false if the reverse is true.
71 bool MergeForward
= false;
73 // SExtIdx gives the index of the result of the load pair that must be
74 // extended. The value of SExtIdx assumes that the paired load produces the
75 // value in this order: (I, returned iterator), i.e., -1 means no value has
76 // to be extended, 0 means I, and 1 means the returned iterator.
79 LdStPairFlags() = default;
81 void setMergeForward(bool V
= true) { MergeForward
= V
; }
82 bool getMergeForward() const { return MergeForward
; }
84 void setSExtIdx(int V
) { SExtIdx
= V
; }
85 int getSExtIdx() const { return SExtIdx
; }
88 struct AArch64LoadStoreOpt
: public MachineFunctionPass
{
91 AArch64LoadStoreOpt() : MachineFunctionPass(ID
) {
92 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
96 const AArch64InstrInfo
*TII
;
97 const TargetRegisterInfo
*TRI
;
98 const AArch64Subtarget
*Subtarget
;
100 // Track which register units have been modified and used.
101 LiveRegUnits ModifiedRegUnits
, UsedRegUnits
;
103 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
104 AU
.addRequired
<AAResultsWrapperPass
>();
105 MachineFunctionPass::getAnalysisUsage(AU
);
108 // Scan the instructions looking for a load/store that can be combined
109 // with the current instruction into a load/store pair.
110 // Return the matching instruction if one is found, else MBB->end().
111 MachineBasicBlock::iterator
findMatchingInsn(MachineBasicBlock::iterator I
,
112 LdStPairFlags
&Flags
,
114 bool FindNarrowMerge
);
116 // Scan the instructions looking for a store that writes to the address from
117 // which the current load instruction reads. Return true if one is found.
118 bool findMatchingStore(MachineBasicBlock::iterator I
, unsigned Limit
,
119 MachineBasicBlock::iterator
&StoreI
);
121 // Merge the two instructions indicated into a wider narrow store instruction.
122 MachineBasicBlock::iterator
123 mergeNarrowZeroStores(MachineBasicBlock::iterator I
,
124 MachineBasicBlock::iterator MergeMI
,
125 const LdStPairFlags
&Flags
);
127 // Merge the two instructions indicated into a single pair-wise instruction.
128 MachineBasicBlock::iterator
129 mergePairedInsns(MachineBasicBlock::iterator I
,
130 MachineBasicBlock::iterator Paired
,
131 const LdStPairFlags
&Flags
);
133 // Promote the load that reads directly from the address stored to.
134 MachineBasicBlock::iterator
135 promoteLoadFromStore(MachineBasicBlock::iterator LoadI
,
136 MachineBasicBlock::iterator StoreI
);
138 // Scan the instruction list to find a base register update that can
139 // be combined with the current instruction (a load or store) using
140 // pre or post indexed addressing with writeback. Scan forwards.
141 MachineBasicBlock::iterator
142 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I
,
143 int UnscaledOffset
, unsigned Limit
);
145 // Scan the instruction list to find a base register update that can
146 // be combined with the current instruction (a load or store) using
147 // pre or post indexed addressing with writeback. Scan backwards.
148 MachineBasicBlock::iterator
149 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I
, unsigned Limit
);
151 // Find an instruction that updates the base register of the ld/st
153 bool isMatchingUpdateInsn(MachineInstr
&MemMI
, MachineInstr
&MI
,
154 unsigned BaseReg
, int Offset
);
156 // Merge a pre- or post-index base register update into a ld/st instruction.
157 MachineBasicBlock::iterator
158 mergeUpdateInsn(MachineBasicBlock::iterator I
,
159 MachineBasicBlock::iterator Update
, bool IsPreIdx
);
161 // Find and merge zero store instructions.
162 bool tryToMergeZeroStInst(MachineBasicBlock::iterator
&MBBI
);
164 // Find and pair ldr/str instructions.
165 bool tryToPairLdStInst(MachineBasicBlock::iterator
&MBBI
);
167 // Find and promote load instructions which read directly from store.
168 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator
&MBBI
);
170 // Find and merge a base register updates before or after a ld/st instruction.
171 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator
&MBBI
);
173 bool optimizeBlock(MachineBasicBlock
&MBB
, bool EnableNarrowZeroStOpt
);
175 bool runOnMachineFunction(MachineFunction
&Fn
) override
;
177 MachineFunctionProperties
getRequiredProperties() const override
{
178 return MachineFunctionProperties().set(
179 MachineFunctionProperties::Property::NoVRegs
);
182 StringRef
getPassName() const override
{ return AARCH64_LOAD_STORE_OPT_NAME
; }
185 char AArch64LoadStoreOpt::ID
= 0;
187 } // end anonymous namespace
189 INITIALIZE_PASS(AArch64LoadStoreOpt
, "aarch64-ldst-opt",
190 AARCH64_LOAD_STORE_OPT_NAME
, false, false)
192 static bool isNarrowStore(unsigned Opc
) {
196 case AArch64::STRBBui
:
197 case AArch64::STURBBi
:
198 case AArch64::STRHHui
:
199 case AArch64::STURHHi
:
204 // Scaling factor for unscaled load or store.
205 static int getMemScale(MachineInstr
&MI
) {
206 switch (MI
.getOpcode()) {
208 llvm_unreachable("Opcode has unknown scale!");
209 case AArch64::LDRBBui
:
210 case AArch64::LDURBBi
:
211 case AArch64::LDRSBWui
:
212 case AArch64::LDURSBWi
:
213 case AArch64::STRBBui
:
214 case AArch64::STURBBi
:
216 case AArch64::LDRHHui
:
217 case AArch64::LDURHHi
:
218 case AArch64::LDRSHWui
:
219 case AArch64::LDURSHWi
:
220 case AArch64::STRHHui
:
221 case AArch64::STURHHi
:
223 case AArch64::LDRSui
:
224 case AArch64::LDURSi
:
225 case AArch64::LDRSWui
:
226 case AArch64::LDURSWi
:
227 case AArch64::LDRWui
:
228 case AArch64::LDURWi
:
229 case AArch64::STRSui
:
230 case AArch64::STURSi
:
231 case AArch64::STRWui
:
232 case AArch64::STURWi
:
234 case AArch64::LDPSWi
:
239 case AArch64::LDRDui
:
240 case AArch64::LDURDi
:
241 case AArch64::LDRXui
:
242 case AArch64::LDURXi
:
243 case AArch64::STRDui
:
244 case AArch64::STURDi
:
245 case AArch64::STRXui
:
246 case AArch64::STURXi
:
252 case AArch64::LDRQui
:
253 case AArch64::LDURQi
:
254 case AArch64::STRQui
:
255 case AArch64::STURQi
:
262 static unsigned getMatchingNonSExtOpcode(unsigned Opc
,
263 bool *IsValidLdStrOpc
= nullptr) {
265 *IsValidLdStrOpc
= true;
269 *IsValidLdStrOpc
= false;
270 return std::numeric_limits
<unsigned>::max();
271 case AArch64::STRDui
:
272 case AArch64::STURDi
:
273 case AArch64::STRQui
:
274 case AArch64::STURQi
:
275 case AArch64::STRBBui
:
276 case AArch64::STURBBi
:
277 case AArch64::STRHHui
:
278 case AArch64::STURHHi
:
279 case AArch64::STRWui
:
280 case AArch64::STURWi
:
281 case AArch64::STRXui
:
282 case AArch64::STURXi
:
283 case AArch64::LDRDui
:
284 case AArch64::LDURDi
:
285 case AArch64::LDRQui
:
286 case AArch64::LDURQi
:
287 case AArch64::LDRWui
:
288 case AArch64::LDURWi
:
289 case AArch64::LDRXui
:
290 case AArch64::LDURXi
:
291 case AArch64::STRSui
:
292 case AArch64::STURSi
:
293 case AArch64::LDRSui
:
294 case AArch64::LDURSi
:
296 case AArch64::LDRSWui
:
297 return AArch64::LDRWui
;
298 case AArch64::LDURSWi
:
299 return AArch64::LDURWi
;
303 static unsigned getMatchingWideOpcode(unsigned Opc
) {
306 llvm_unreachable("Opcode has no wide equivalent!");
307 case AArch64::STRBBui
:
308 return AArch64::STRHHui
;
309 case AArch64::STRHHui
:
310 return AArch64::STRWui
;
311 case AArch64::STURBBi
:
312 return AArch64::STURHHi
;
313 case AArch64::STURHHi
:
314 return AArch64::STURWi
;
315 case AArch64::STURWi
:
316 return AArch64::STURXi
;
317 case AArch64::STRWui
:
318 return AArch64::STRXui
;
322 static unsigned getMatchingPairOpcode(unsigned Opc
) {
325 llvm_unreachable("Opcode has no pairwise equivalent!");
326 case AArch64::STRSui
:
327 case AArch64::STURSi
:
328 return AArch64::STPSi
;
329 case AArch64::STRDui
:
330 case AArch64::STURDi
:
331 return AArch64::STPDi
;
332 case AArch64::STRQui
:
333 case AArch64::STURQi
:
334 return AArch64::STPQi
;
335 case AArch64::STRWui
:
336 case AArch64::STURWi
:
337 return AArch64::STPWi
;
338 case AArch64::STRXui
:
339 case AArch64::STURXi
:
340 return AArch64::STPXi
;
341 case AArch64::LDRSui
:
342 case AArch64::LDURSi
:
343 return AArch64::LDPSi
;
344 case AArch64::LDRDui
:
345 case AArch64::LDURDi
:
346 return AArch64::LDPDi
;
347 case AArch64::LDRQui
:
348 case AArch64::LDURQi
:
349 return AArch64::LDPQi
;
350 case AArch64::LDRWui
:
351 case AArch64::LDURWi
:
352 return AArch64::LDPWi
;
353 case AArch64::LDRXui
:
354 case AArch64::LDURXi
:
355 return AArch64::LDPXi
;
356 case AArch64::LDRSWui
:
357 case AArch64::LDURSWi
:
358 return AArch64::LDPSWi
;
362 static unsigned isMatchingStore(MachineInstr
&LoadInst
,
363 MachineInstr
&StoreInst
) {
364 unsigned LdOpc
= LoadInst
.getOpcode();
365 unsigned StOpc
= StoreInst
.getOpcode();
368 llvm_unreachable("Unsupported load instruction!");
369 case AArch64::LDRBBui
:
370 return StOpc
== AArch64::STRBBui
|| StOpc
== AArch64::STRHHui
||
371 StOpc
== AArch64::STRWui
|| StOpc
== AArch64::STRXui
;
372 case AArch64::LDURBBi
:
373 return StOpc
== AArch64::STURBBi
|| StOpc
== AArch64::STURHHi
||
374 StOpc
== AArch64::STURWi
|| StOpc
== AArch64::STURXi
;
375 case AArch64::LDRHHui
:
376 return StOpc
== AArch64::STRHHui
|| StOpc
== AArch64::STRWui
||
377 StOpc
== AArch64::STRXui
;
378 case AArch64::LDURHHi
:
379 return StOpc
== AArch64::STURHHi
|| StOpc
== AArch64::STURWi
||
380 StOpc
== AArch64::STURXi
;
381 case AArch64::LDRWui
:
382 return StOpc
== AArch64::STRWui
|| StOpc
== AArch64::STRXui
;
383 case AArch64::LDURWi
:
384 return StOpc
== AArch64::STURWi
|| StOpc
== AArch64::STURXi
;
385 case AArch64::LDRXui
:
386 return StOpc
== AArch64::STRXui
;
387 case AArch64::LDURXi
:
388 return StOpc
== AArch64::STURXi
;
392 static unsigned getPreIndexedOpcode(unsigned Opc
) {
393 // FIXME: We don't currently support creating pre-indexed loads/stores when
394 // the load or store is the unscaled version. If we decide to perform such an
395 // optimization in the future the cases for the unscaled loads/stores will
396 // need to be added here.
399 llvm_unreachable("Opcode has no pre-indexed equivalent!");
400 case AArch64::STRSui
:
401 return AArch64::STRSpre
;
402 case AArch64::STRDui
:
403 return AArch64::STRDpre
;
404 case AArch64::STRQui
:
405 return AArch64::STRQpre
;
406 case AArch64::STRBBui
:
407 return AArch64::STRBBpre
;
408 case AArch64::STRHHui
:
409 return AArch64::STRHHpre
;
410 case AArch64::STRWui
:
411 return AArch64::STRWpre
;
412 case AArch64::STRXui
:
413 return AArch64::STRXpre
;
414 case AArch64::LDRSui
:
415 return AArch64::LDRSpre
;
416 case AArch64::LDRDui
:
417 return AArch64::LDRDpre
;
418 case AArch64::LDRQui
:
419 return AArch64::LDRQpre
;
420 case AArch64::LDRBBui
:
421 return AArch64::LDRBBpre
;
422 case AArch64::LDRHHui
:
423 return AArch64::LDRHHpre
;
424 case AArch64::LDRWui
:
425 return AArch64::LDRWpre
;
426 case AArch64::LDRXui
:
427 return AArch64::LDRXpre
;
428 case AArch64::LDRSWui
:
429 return AArch64::LDRSWpre
;
431 return AArch64::LDPSpre
;
432 case AArch64::LDPSWi
:
433 return AArch64::LDPSWpre
;
435 return AArch64::LDPDpre
;
437 return AArch64::LDPQpre
;
439 return AArch64::LDPWpre
;
441 return AArch64::LDPXpre
;
443 return AArch64::STPSpre
;
445 return AArch64::STPDpre
;
447 return AArch64::STPQpre
;
449 return AArch64::STPWpre
;
451 return AArch64::STPXpre
;
455 static unsigned getPostIndexedOpcode(unsigned Opc
) {
458 llvm_unreachable("Opcode has no post-indexed wise equivalent!");
459 case AArch64::STRSui
:
460 case AArch64::STURSi
:
461 return AArch64::STRSpost
;
462 case AArch64::STRDui
:
463 case AArch64::STURDi
:
464 return AArch64::STRDpost
;
465 case AArch64::STRQui
:
466 case AArch64::STURQi
:
467 return AArch64::STRQpost
;
468 case AArch64::STRBBui
:
469 return AArch64::STRBBpost
;
470 case AArch64::STRHHui
:
471 return AArch64::STRHHpost
;
472 case AArch64::STRWui
:
473 case AArch64::STURWi
:
474 return AArch64::STRWpost
;
475 case AArch64::STRXui
:
476 case AArch64::STURXi
:
477 return AArch64::STRXpost
;
478 case AArch64::LDRSui
:
479 case AArch64::LDURSi
:
480 return AArch64::LDRSpost
;
481 case AArch64::LDRDui
:
482 case AArch64::LDURDi
:
483 return AArch64::LDRDpost
;
484 case AArch64::LDRQui
:
485 case AArch64::LDURQi
:
486 return AArch64::LDRQpost
;
487 case AArch64::LDRBBui
:
488 return AArch64::LDRBBpost
;
489 case AArch64::LDRHHui
:
490 return AArch64::LDRHHpost
;
491 case AArch64::LDRWui
:
492 case AArch64::LDURWi
:
493 return AArch64::LDRWpost
;
494 case AArch64::LDRXui
:
495 case AArch64::LDURXi
:
496 return AArch64::LDRXpost
;
497 case AArch64::LDRSWui
:
498 return AArch64::LDRSWpost
;
500 return AArch64::LDPSpost
;
501 case AArch64::LDPSWi
:
502 return AArch64::LDPSWpost
;
504 return AArch64::LDPDpost
;
506 return AArch64::LDPQpost
;
508 return AArch64::LDPWpost
;
510 return AArch64::LDPXpost
;
512 return AArch64::STPSpost
;
514 return AArch64::STPDpost
;
516 return AArch64::STPQpost
;
518 return AArch64::STPWpost
;
520 return AArch64::STPXpost
;
524 static bool isPairedLdSt(const MachineInstr
&MI
) {
525 switch (MI
.getOpcode()) {
529 case AArch64::LDPSWi
:
543 static const MachineOperand
&getLdStRegOp(const MachineInstr
&MI
,
544 unsigned PairedRegOp
= 0) {
545 assert(PairedRegOp
< 2 && "Unexpected register operand idx.");
546 unsigned Idx
= isPairedLdSt(MI
) ? PairedRegOp
: 0;
547 return MI
.getOperand(Idx
);
550 static const MachineOperand
&getLdStBaseOp(const MachineInstr
&MI
) {
551 unsigned Idx
= isPairedLdSt(MI
) ? 2 : 1;
552 return MI
.getOperand(Idx
);
555 static const MachineOperand
&getLdStOffsetOp(const MachineInstr
&MI
) {
556 unsigned Idx
= isPairedLdSt(MI
) ? 3 : 2;
557 return MI
.getOperand(Idx
);
560 static bool isLdOffsetInRangeOfSt(MachineInstr
&LoadInst
,
561 MachineInstr
&StoreInst
,
562 const AArch64InstrInfo
*TII
) {
563 assert(isMatchingStore(LoadInst
, StoreInst
) && "Expect only matched ld/st.");
564 int LoadSize
= getMemScale(LoadInst
);
565 int StoreSize
= getMemScale(StoreInst
);
566 int UnscaledStOffset
= TII
->isUnscaledLdSt(StoreInst
)
567 ? getLdStOffsetOp(StoreInst
).getImm()
568 : getLdStOffsetOp(StoreInst
).getImm() * StoreSize
;
569 int UnscaledLdOffset
= TII
->isUnscaledLdSt(LoadInst
)
570 ? getLdStOffsetOp(LoadInst
).getImm()
571 : getLdStOffsetOp(LoadInst
).getImm() * LoadSize
;
572 return (UnscaledStOffset
<= UnscaledLdOffset
) &&
573 (UnscaledLdOffset
+ LoadSize
<= (UnscaledStOffset
+ StoreSize
));
576 static bool isPromotableZeroStoreInst(MachineInstr
&MI
) {
577 unsigned Opc
= MI
.getOpcode();
578 return (Opc
== AArch64::STRWui
|| Opc
== AArch64::STURWi
||
579 isNarrowStore(Opc
)) &&
580 getLdStRegOp(MI
).getReg() == AArch64::WZR
;
583 static bool isPromotableLoadFromStore(MachineInstr
&MI
) {
584 switch (MI
.getOpcode()) {
587 // Scaled instructions.
588 case AArch64::LDRBBui
:
589 case AArch64::LDRHHui
:
590 case AArch64::LDRWui
:
591 case AArch64::LDRXui
:
592 // Unscaled instructions.
593 case AArch64::LDURBBi
:
594 case AArch64::LDURHHi
:
595 case AArch64::LDURWi
:
596 case AArch64::LDURXi
:
601 static bool isMergeableLdStUpdate(MachineInstr
&MI
) {
602 unsigned Opc
= MI
.getOpcode();
606 // Scaled instructions.
607 case AArch64::STRSui
:
608 case AArch64::STRDui
:
609 case AArch64::STRQui
:
610 case AArch64::STRXui
:
611 case AArch64::STRWui
:
612 case AArch64::STRHHui
:
613 case AArch64::STRBBui
:
614 case AArch64::LDRSui
:
615 case AArch64::LDRDui
:
616 case AArch64::LDRQui
:
617 case AArch64::LDRXui
:
618 case AArch64::LDRWui
:
619 case AArch64::LDRHHui
:
620 case AArch64::LDRBBui
:
621 // Unscaled instructions.
622 case AArch64::STURSi
:
623 case AArch64::STURDi
:
624 case AArch64::STURQi
:
625 case AArch64::STURWi
:
626 case AArch64::STURXi
:
627 case AArch64::LDURSi
:
628 case AArch64::LDURDi
:
629 case AArch64::LDURQi
:
630 case AArch64::LDURWi
:
631 case AArch64::LDURXi
:
632 // Paired instructions.
634 case AArch64::LDPSWi
:
644 // Make sure this is a reg+imm (as opposed to an address reloc).
645 if (!getLdStOffsetOp(MI
).isImm())
652 MachineBasicBlock::iterator
653 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I
,
654 MachineBasicBlock::iterator MergeMI
,
655 const LdStPairFlags
&Flags
) {
656 assert(isPromotableZeroStoreInst(*I
) && isPromotableZeroStoreInst(*MergeMI
) &&
657 "Expected promotable zero stores.");
659 MachineBasicBlock::iterator NextI
= I
;
661 // If NextI is the second of the two instructions to be merged, we need
662 // to skip one further. Either way we merge will invalidate the iterator,
663 // and we don't need to scan the new instruction, as it's a pairwise
664 // instruction, which we're not considering for further action anyway.
665 if (NextI
== MergeMI
)
668 unsigned Opc
= I
->getOpcode();
669 bool IsScaled
= !TII
->isUnscaledLdSt(Opc
);
670 int OffsetStride
= IsScaled
? 1 : getMemScale(*I
);
672 bool MergeForward
= Flags
.getMergeForward();
673 // Insert our new paired instruction after whichever of the paired
674 // instructions MergeForward indicates.
675 MachineBasicBlock::iterator InsertionPoint
= MergeForward
? MergeMI
: I
;
676 // Also based on MergeForward is from where we copy the base register operand
677 // so we get the flags compatible with the input code.
678 const MachineOperand
&BaseRegOp
=
679 MergeForward
? getLdStBaseOp(*MergeMI
) : getLdStBaseOp(*I
);
681 // Which register is Rt and which is Rt2 depends on the offset order.
683 if (getLdStOffsetOp(*I
).getImm() ==
684 getLdStOffsetOp(*MergeMI
).getImm() + OffsetStride
)
689 int OffsetImm
= getLdStOffsetOp(*RtMI
).getImm();
690 // Change the scaled offset from small to large type.
692 assert(((OffsetImm
& 1) == 0) && "Unexpected offset to merge");
696 // Construct the new instruction.
697 DebugLoc DL
= I
->getDebugLoc();
698 MachineBasicBlock
*MBB
= I
->getParent();
699 MachineInstrBuilder MIB
;
700 MIB
= BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(getMatchingWideOpcode(Opc
)))
701 .addReg(isNarrowStore(Opc
) ? AArch64::WZR
: AArch64::XZR
)
704 .cloneMergedMemRefs({&*I
, &*MergeMI
})
705 .setMIFlags(I
->mergeFlagsWith(*MergeMI
));
708 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
709 LLVM_DEBUG(I
->print(dbgs()));
710 LLVM_DEBUG(dbgs() << " ");
711 LLVM_DEBUG(MergeMI
->print(dbgs()));
712 LLVM_DEBUG(dbgs() << " with instruction:\n ");
713 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
714 LLVM_DEBUG(dbgs() << "\n");
716 // Erase the old instructions.
717 I
->eraseFromParent();
718 MergeMI
->eraseFromParent();
722 MachineBasicBlock::iterator
723 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I
,
724 MachineBasicBlock::iterator Paired
,
725 const LdStPairFlags
&Flags
) {
726 MachineBasicBlock::iterator NextI
= I
;
728 // If NextI is the second of the two instructions to be merged, we need
729 // to skip one further. Either way we merge will invalidate the iterator,
730 // and we don't need to scan the new instruction, as it's a pairwise
731 // instruction, which we're not considering for further action anyway.
735 int SExtIdx
= Flags
.getSExtIdx();
737 SExtIdx
== -1 ? I
->getOpcode() : getMatchingNonSExtOpcode(I
->getOpcode());
738 bool IsUnscaled
= TII
->isUnscaledLdSt(Opc
);
739 int OffsetStride
= IsUnscaled
? getMemScale(*I
) : 1;
741 bool MergeForward
= Flags
.getMergeForward();
742 // Insert our new paired instruction after whichever of the paired
743 // instructions MergeForward indicates.
744 MachineBasicBlock::iterator InsertionPoint
= MergeForward
? Paired
: I
;
745 // Also based on MergeForward is from where we copy the base register operand
746 // so we get the flags compatible with the input code.
747 const MachineOperand
&BaseRegOp
=
748 MergeForward
? getLdStBaseOp(*Paired
) : getLdStBaseOp(*I
);
750 int Offset
= getLdStOffsetOp(*I
).getImm();
751 int PairedOffset
= getLdStOffsetOp(*Paired
).getImm();
752 bool PairedIsUnscaled
= TII
->isUnscaledLdSt(Paired
->getOpcode());
753 if (IsUnscaled
!= PairedIsUnscaled
) {
754 // We're trying to pair instructions that differ in how they are scaled. If
755 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
756 // the opposite (i.e., make Paired's offset unscaled).
757 int MemSize
= getMemScale(*Paired
);
758 if (PairedIsUnscaled
) {
759 // If the unscaled offset isn't a multiple of the MemSize, we can't
760 // pair the operations together.
761 assert(!(PairedOffset
% getMemScale(*Paired
)) &&
762 "Offset should be a multiple of the stride!");
763 PairedOffset
/= MemSize
;
765 PairedOffset
*= MemSize
;
769 // Which register is Rt and which is Rt2 depends on the offset order.
770 MachineInstr
*RtMI
, *Rt2MI
;
771 if (Offset
== PairedOffset
+ OffsetStride
) {
774 // Here we swapped the assumption made for SExtIdx.
775 // I.e., we turn ldp I, Paired into ldp Paired, I.
776 // Update the index accordingly.
778 SExtIdx
= (SExtIdx
+ 1) % 2;
783 int OffsetImm
= getLdStOffsetOp(*RtMI
).getImm();
784 // Scale the immediate offset, if necessary.
785 if (TII
->isUnscaledLdSt(RtMI
->getOpcode())) {
786 assert(!(OffsetImm
% getMemScale(*RtMI
)) &&
787 "Unscaled offset cannot be scaled.");
788 OffsetImm
/= getMemScale(*RtMI
);
791 // Construct the new instruction.
792 MachineInstrBuilder MIB
;
793 DebugLoc DL
= I
->getDebugLoc();
794 MachineBasicBlock
*MBB
= I
->getParent();
795 MachineOperand RegOp0
= getLdStRegOp(*RtMI
);
796 MachineOperand RegOp1
= getLdStRegOp(*Rt2MI
);
797 // Kill flags may become invalid when moving stores for pairing.
798 if (RegOp0
.isUse()) {
800 // Clear kill flags on store if moving upwards. Example:
803 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
804 RegOp0
.setIsKill(false);
805 RegOp1
.setIsKill(false);
807 // Clear kill flags of the first stores register. Example:
809 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
811 unsigned Reg
= getLdStRegOp(*I
).getReg();
812 for (MachineInstr
&MI
: make_range(std::next(I
), Paired
))
813 MI
.clearRegisterKills(Reg
, TRI
);
816 MIB
= BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(getMatchingPairOpcode(Opc
)))
821 .cloneMergedMemRefs({&*I
, &*Paired
})
822 .setMIFlags(I
->mergeFlagsWith(*Paired
));
827 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
828 LLVM_DEBUG(I
->print(dbgs()));
829 LLVM_DEBUG(dbgs() << " ");
830 LLVM_DEBUG(Paired
->print(dbgs()));
831 LLVM_DEBUG(dbgs() << " with instruction:\n ");
833 // Generate the sign extension for the proper result of the ldp.
834 // I.e., with X1, that would be:
835 // %w1 = KILL %w1, implicit-def %x1
836 // %x1 = SBFMXri killed %x1, 0, 31
837 MachineOperand
&DstMO
= MIB
->getOperand(SExtIdx
);
838 // Right now, DstMO has the extended register, since it comes from an
840 unsigned DstRegX
= DstMO
.getReg();
841 // Get the W variant of that register.
842 unsigned DstRegW
= TRI
->getSubReg(DstRegX
, AArch64::sub_32
);
843 // Update the result of LDP to use the W instead of the X variant.
844 DstMO
.setReg(DstRegW
);
845 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
846 LLVM_DEBUG(dbgs() << "\n");
847 // Make the machine verifier happy by providing a definition for
849 // Insert this definition right after the generated LDP, i.e., before
851 MachineInstrBuilder MIBKill
=
852 BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(TargetOpcode::KILL
), DstRegW
)
854 .addReg(DstRegX
, RegState::Define
);
855 MIBKill
->getOperand(2).setImplicit();
856 // Create the sign extension.
857 MachineInstrBuilder MIBSXTW
=
858 BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(AArch64::SBFMXri
), DstRegX
)
863 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
864 LLVM_DEBUG(((MachineInstr
*)MIBSXTW
)->print(dbgs()));
866 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
868 LLVM_DEBUG(dbgs() << "\n");
870 // Erase the old instructions.
871 I
->eraseFromParent();
872 Paired
->eraseFromParent();
877 MachineBasicBlock::iterator
878 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI
,
879 MachineBasicBlock::iterator StoreI
) {
880 MachineBasicBlock::iterator NextI
= LoadI
;
883 int LoadSize
= getMemScale(*LoadI
);
884 int StoreSize
= getMemScale(*StoreI
);
885 unsigned LdRt
= getLdStRegOp(*LoadI
).getReg();
886 const MachineOperand
&StMO
= getLdStRegOp(*StoreI
);
887 unsigned StRt
= getLdStRegOp(*StoreI
).getReg();
888 bool IsStoreXReg
= TRI
->getRegClass(AArch64::GPR64RegClassID
)->contains(StRt
);
890 assert((IsStoreXReg
||
891 TRI
->getRegClass(AArch64::GPR32RegClassID
)->contains(StRt
)) &&
892 "Unexpected RegClass");
894 MachineInstr
*BitExtMI
;
895 if (LoadSize
== StoreSize
&& (LoadSize
== 4 || LoadSize
== 8)) {
896 // Remove the load, if the destination register of the loads is the same
897 // register for stored value.
898 if (StRt
== LdRt
&& LoadSize
== 8) {
899 for (MachineInstr
&MI
: make_range(StoreI
->getIterator(),
900 LoadI
->getIterator())) {
901 if (MI
.killsRegister(StRt
, TRI
)) {
902 MI
.clearRegisterKills(StRt
, TRI
);
906 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
907 LLVM_DEBUG(LoadI
->print(dbgs()));
908 LLVM_DEBUG(dbgs() << "\n");
909 LoadI
->eraseFromParent();
912 // Replace the load with a mov if the load and store are in the same size.
914 BuildMI(*LoadI
->getParent(), LoadI
, LoadI
->getDebugLoc(),
915 TII
->get(IsStoreXReg
? AArch64::ORRXrs
: AArch64::ORRWrs
), LdRt
)
916 .addReg(IsStoreXReg
? AArch64::XZR
: AArch64::WZR
)
918 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL
, 0))
919 .setMIFlags(LoadI
->getFlags());
921 // FIXME: Currently we disable this transformation in big-endian targets as
922 // performance and correctness are verified only in little-endian.
923 if (!Subtarget
->isLittleEndian())
925 bool IsUnscaled
= TII
->isUnscaledLdSt(*LoadI
);
926 assert(IsUnscaled
== TII
->isUnscaledLdSt(*StoreI
) &&
927 "Unsupported ld/st match");
928 assert(LoadSize
<= StoreSize
&& "Invalid load size");
929 int UnscaledLdOffset
= IsUnscaled
930 ? getLdStOffsetOp(*LoadI
).getImm()
931 : getLdStOffsetOp(*LoadI
).getImm() * LoadSize
;
932 int UnscaledStOffset
= IsUnscaled
933 ? getLdStOffsetOp(*StoreI
).getImm()
934 : getLdStOffsetOp(*StoreI
).getImm() * StoreSize
;
935 int Width
= LoadSize
* 8;
936 int Immr
= 8 * (UnscaledLdOffset
- UnscaledStOffset
);
937 int Imms
= Immr
+ Width
- 1;
938 unsigned DestReg
= IsStoreXReg
939 ? TRI
->getMatchingSuperReg(LdRt
, AArch64::sub_32
,
940 &AArch64::GPR64RegClass
)
943 assert((UnscaledLdOffset
>= UnscaledStOffset
&&
944 (UnscaledLdOffset
+ LoadSize
) <= UnscaledStOffset
+ StoreSize
) &&
947 Immr
= 8 * (UnscaledLdOffset
- UnscaledStOffset
);
948 Imms
= Immr
+ Width
- 1;
949 if (UnscaledLdOffset
== UnscaledStOffset
) {
950 uint32_t AndMaskEncoded
= ((IsStoreXReg
? 1 : 0) << 12) // N
951 | ((Immr
) << 6) // immr
952 | ((Imms
) << 0) // imms
956 BuildMI(*LoadI
->getParent(), LoadI
, LoadI
->getDebugLoc(),
957 TII
->get(IsStoreXReg
? AArch64::ANDXri
: AArch64::ANDWri
),
960 .addImm(AndMaskEncoded
)
961 .setMIFlags(LoadI
->getFlags());
964 BuildMI(*LoadI
->getParent(), LoadI
, LoadI
->getDebugLoc(),
965 TII
->get(IsStoreXReg
? AArch64::UBFMXri
: AArch64::UBFMWri
),
970 .setMIFlags(LoadI
->getFlags());
974 // Clear kill flags between store and load.
975 for (MachineInstr
&MI
: make_range(StoreI
->getIterator(),
976 BitExtMI
->getIterator()))
977 if (MI
.killsRegister(StRt
, TRI
)) {
978 MI
.clearRegisterKills(StRt
, TRI
);
982 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
983 LLVM_DEBUG(StoreI
->print(dbgs()));
984 LLVM_DEBUG(dbgs() << " ");
985 LLVM_DEBUG(LoadI
->print(dbgs()));
986 LLVM_DEBUG(dbgs() << " with instructions:\n ");
987 LLVM_DEBUG(StoreI
->print(dbgs()));
988 LLVM_DEBUG(dbgs() << " ");
989 LLVM_DEBUG((BitExtMI
)->print(dbgs()));
990 LLVM_DEBUG(dbgs() << "\n");
992 // Erase the old instructions.
993 LoadI
->eraseFromParent();
997 static bool inBoundsForPair(bool IsUnscaled
, int Offset
, int OffsetStride
) {
998 // Convert the byte-offset used by unscaled into an "element" offset used
999 // by the scaled pair load/store instructions.
1001 // If the byte-offset isn't a multiple of the stride, there's no point
1002 // trying to match it.
1003 if (Offset
% OffsetStride
)
1005 Offset
/= OffsetStride
;
1007 return Offset
<= 63 && Offset
>= -64;
1010 // Do alignment, specialized to power of 2 and for signed ints,
1011 // avoiding having to do a C-style cast from uint_64t to int when
1012 // using alignTo from include/llvm/Support/MathExtras.h.
1013 // FIXME: Move this function to include/MathExtras.h?
1014 static int alignTo(int Num
, int PowOf2
) {
1015 return (Num
+ PowOf2
- 1) & ~(PowOf2
- 1);
1018 static bool mayAlias(MachineInstr
&MIa
, MachineInstr
&MIb
,
1019 AliasAnalysis
*AA
) {
1020 // One of the instructions must modify memory.
1021 if (!MIa
.mayStore() && !MIb
.mayStore())
1024 // Both instructions must be memory operations.
1025 if (!MIa
.mayLoadOrStore() && !MIb
.mayLoadOrStore())
1028 return MIa
.mayAlias(AA
, MIb
, /*UseTBAA*/false);
1031 static bool mayAlias(MachineInstr
&MIa
,
1032 SmallVectorImpl
<MachineInstr
*> &MemInsns
,
1033 AliasAnalysis
*AA
) {
1034 for (MachineInstr
*MIb
: MemInsns
)
1035 if (mayAlias(MIa
, *MIb
, AA
))
1041 bool AArch64LoadStoreOpt::findMatchingStore(
1042 MachineBasicBlock::iterator I
, unsigned Limit
,
1043 MachineBasicBlock::iterator
&StoreI
) {
1044 MachineBasicBlock::iterator B
= I
->getParent()->begin();
1045 MachineBasicBlock::iterator MBBI
= I
;
1046 MachineInstr
&LoadMI
= *I
;
1047 unsigned BaseReg
= getLdStBaseOp(LoadMI
).getReg();
1049 // If the load is the first instruction in the block, there's obviously
1050 // not any matching store.
1054 // Track which register units have been modified and used between the first
1055 // insn and the second insn.
1056 ModifiedRegUnits
.clear();
1057 UsedRegUnits
.clear();
1062 MachineInstr
&MI
= *MBBI
;
1064 // Don't count transient instructions towards the search limit since there
1065 // may be different numbers of them if e.g. debug information is present.
1066 if (!MI
.isTransient())
1069 // If the load instruction reads directly from the address to which the
1070 // store instruction writes and the stored value is not modified, we can
1071 // promote the load. Since we do not handle stores with pre-/post-index,
1072 // it's unnecessary to check if BaseReg is modified by the store itself.
1073 if (MI
.mayStore() && isMatchingStore(LoadMI
, MI
) &&
1074 BaseReg
== getLdStBaseOp(MI
).getReg() &&
1075 isLdOffsetInRangeOfSt(LoadMI
, MI
, TII
) &&
1076 ModifiedRegUnits
.available(getLdStRegOp(MI
).getReg())) {
1084 // Update modified / uses register units.
1085 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1087 // Otherwise, if the base register is modified, we have no match, so
1089 if (!ModifiedRegUnits
.available(BaseReg
))
1092 // If we encounter a store aliased with the load, return early.
1093 if (MI
.mayStore() && mayAlias(LoadMI
, MI
, AA
))
1095 } while (MBBI
!= B
&& Count
< Limit
);
1099 // Returns true if FirstMI and MI are candidates for merging or pairing.
1100 // Otherwise, returns false.
1101 static bool areCandidatesToMergeOrPair(MachineInstr
&FirstMI
, MachineInstr
&MI
,
1102 LdStPairFlags
&Flags
,
1103 const AArch64InstrInfo
*TII
) {
1104 // If this is volatile or if pairing is suppressed, not a candidate.
1105 if (MI
.hasOrderedMemoryRef() || TII
->isLdStPairSuppressed(MI
))
1108 // We should have already checked FirstMI for pair suppression and volatility.
1109 assert(!FirstMI
.hasOrderedMemoryRef() &&
1110 !TII
->isLdStPairSuppressed(FirstMI
) &&
1111 "FirstMI shouldn't get here if either of these checks are true.");
1113 unsigned OpcA
= FirstMI
.getOpcode();
1114 unsigned OpcB
= MI
.getOpcode();
1116 // Opcodes match: nothing more to check.
1120 // Try to match a sign-extended load/store with a zero-extended load/store.
1121 bool IsValidLdStrOpc
, PairIsValidLdStrOpc
;
1122 unsigned NonSExtOpc
= getMatchingNonSExtOpcode(OpcA
, &IsValidLdStrOpc
);
1123 assert(IsValidLdStrOpc
&&
1124 "Given Opc should be a Load or Store with an immediate");
1125 // OpcA will be the first instruction in the pair.
1126 if (NonSExtOpc
== getMatchingNonSExtOpcode(OpcB
, &PairIsValidLdStrOpc
)) {
1127 Flags
.setSExtIdx(NonSExtOpc
== (unsigned)OpcA
? 1 : 0);
1131 // If the second instruction isn't even a mergable/pairable load/store, bail
1133 if (!PairIsValidLdStrOpc
)
1136 // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1138 if (isNarrowStore(OpcA
) || isNarrowStore(OpcB
))
1141 // Try to match an unscaled load/store with a scaled load/store.
1142 return TII
->isUnscaledLdSt(OpcA
) != TII
->isUnscaledLdSt(OpcB
) &&
1143 getMatchingPairOpcode(OpcA
) == getMatchingPairOpcode(OpcB
);
1145 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1148 /// Scan the instructions looking for a load/store that can be combined with the
1149 /// current instruction into a wider equivalent or a load/store pair.
1150 MachineBasicBlock::iterator
1151 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I
,
1152 LdStPairFlags
&Flags
, unsigned Limit
,
1153 bool FindNarrowMerge
) {
1154 MachineBasicBlock::iterator E
= I
->getParent()->end();
1155 MachineBasicBlock::iterator MBBI
= I
;
1156 MachineInstr
&FirstMI
= *I
;
1159 bool MayLoad
= FirstMI
.mayLoad();
1160 bool IsUnscaled
= TII
->isUnscaledLdSt(FirstMI
);
1161 unsigned Reg
= getLdStRegOp(FirstMI
).getReg();
1162 unsigned BaseReg
= getLdStBaseOp(FirstMI
).getReg();
1163 int Offset
= getLdStOffsetOp(FirstMI
).getImm();
1164 int OffsetStride
= IsUnscaled
? getMemScale(FirstMI
) : 1;
1165 bool IsPromotableZeroStore
= isPromotableZeroStoreInst(FirstMI
);
1167 // Track which register units have been modified and used between the first
1168 // insn (inclusive) and the second insn.
1169 ModifiedRegUnits
.clear();
1170 UsedRegUnits
.clear();
1172 // Remember any instructions that read/write memory between FirstMI and MI.
1173 SmallVector
<MachineInstr
*, 4> MemInsns
;
1175 for (unsigned Count
= 0; MBBI
!= E
&& Count
< Limit
; ++MBBI
) {
1176 MachineInstr
&MI
= *MBBI
;
1178 // Don't count transient instructions towards the search limit since there
1179 // may be different numbers of them if e.g. debug information is present.
1180 if (!MI
.isTransient())
1183 Flags
.setSExtIdx(-1);
1184 if (areCandidatesToMergeOrPair(FirstMI
, MI
, Flags
, TII
) &&
1185 getLdStOffsetOp(MI
).isImm()) {
1186 assert(MI
.mayLoadOrStore() && "Expected memory operation.");
1187 // If we've found another instruction with the same opcode, check to see
1188 // if the base and offset are compatible with our starting instruction.
1189 // These instructions all have scaled immediate operands, so we just
1190 // check for +1/-1. Make sure to check the new instruction offset is
1191 // actually an immediate and not a symbolic reference destined for
1193 unsigned MIBaseReg
= getLdStBaseOp(MI
).getReg();
1194 int MIOffset
= getLdStOffsetOp(MI
).getImm();
1195 bool MIIsUnscaled
= TII
->isUnscaledLdSt(MI
);
1196 if (IsUnscaled
!= MIIsUnscaled
) {
1197 // We're trying to pair instructions that differ in how they are scaled.
1198 // If FirstMI is scaled then scale the offset of MI accordingly.
1199 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1200 int MemSize
= getMemScale(MI
);
1202 // If the unscaled offset isn't a multiple of the MemSize, we can't
1203 // pair the operations together: bail and keep looking.
1204 if (MIOffset
% MemSize
) {
1205 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1207 MemInsns
.push_back(&MI
);
1210 MIOffset
/= MemSize
;
1212 MIOffset
*= MemSize
;
1216 if (BaseReg
== MIBaseReg
&& ((Offset
== MIOffset
+ OffsetStride
) ||
1217 (Offset
+ OffsetStride
== MIOffset
))) {
1218 int MinOffset
= Offset
< MIOffset
? Offset
: MIOffset
;
1219 if (FindNarrowMerge
) {
1220 // If the alignment requirements of the scaled wide load/store
1221 // instruction can't express the offset of the scaled narrow input,
1222 // bail and keep looking. For promotable zero stores, allow only when
1223 // the stored value is the same (i.e., WZR).
1224 if ((!IsUnscaled
&& alignTo(MinOffset
, 2) != MinOffset
) ||
1225 (IsPromotableZeroStore
&& Reg
!= getLdStRegOp(MI
).getReg())) {
1226 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1228 MemInsns
.push_back(&MI
);
1232 // Pairwise instructions have a 7-bit signed offset field. Single
1233 // insns have a 12-bit unsigned offset field. If the resultant
1234 // immediate offset of merging these instructions is out of range for
1235 // a pairwise instruction, bail and keep looking.
1236 if (!inBoundsForPair(IsUnscaled
, MinOffset
, OffsetStride
)) {
1237 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1239 MemInsns
.push_back(&MI
);
1242 // If the alignment requirements of the paired (scaled) instruction
1243 // can't express the offset of the unscaled input, bail and keep
1245 if (IsUnscaled
&& (alignTo(MinOffset
, OffsetStride
) != MinOffset
)) {
1246 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1248 MemInsns
.push_back(&MI
);
1252 // If the destination register of the loads is the same register, bail
1253 // and keep looking. A load-pair instruction with both destination
1254 // registers the same is UNPREDICTABLE and will result in an exception.
1255 if (MayLoad
&& Reg
== getLdStRegOp(MI
).getReg()) {
1256 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
,
1258 MemInsns
.push_back(&MI
);
1262 // If the Rt of the second instruction was not modified or used between
1263 // the two instructions and none of the instructions between the second
1264 // and first alias with the second, we can combine the second into the
1266 if (ModifiedRegUnits
.available(getLdStRegOp(MI
).getReg()) &&
1268 !UsedRegUnits
.available(getLdStRegOp(MI
).getReg())) &&
1269 !mayAlias(MI
, MemInsns
, AA
)) {
1270 Flags
.setMergeForward(false);
1274 // Likewise, if the Rt of the first instruction is not modified or used
1275 // between the two instructions and none of the instructions between the
1276 // first and the second alias with the first, we can combine the first
1278 if (ModifiedRegUnits
.available(getLdStRegOp(FirstMI
).getReg()) &&
1280 !UsedRegUnits
.available(getLdStRegOp(FirstMI
).getReg())) &&
1281 !mayAlias(FirstMI
, MemInsns
, AA
)) {
1282 Flags
.setMergeForward(true);
1285 // Unable to combine these instructions due to interference in between.
1290 // If the instruction wasn't a matching load or store. Stop searching if we
1291 // encounter a call instruction that might modify memory.
1295 // Update modified / uses register units.
1296 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1298 // Otherwise, if the base register is modified, we have no match, so
1300 if (!ModifiedRegUnits
.available(BaseReg
))
1303 // Update list of instructions that read/write memory.
1304 if (MI
.mayLoadOrStore())
1305 MemInsns
.push_back(&MI
);
1310 MachineBasicBlock::iterator
1311 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I
,
1312 MachineBasicBlock::iterator Update
,
1314 assert((Update
->getOpcode() == AArch64::ADDXri
||
1315 Update
->getOpcode() == AArch64::SUBXri
) &&
1316 "Unexpected base register update instruction to merge!");
1317 MachineBasicBlock::iterator NextI
= I
;
1318 // Return the instruction following the merged instruction, which is
1319 // the instruction following our unmerged load. Unless that's the add/sub
1320 // instruction we're merging, in which case it's the one after that.
1321 if (++NextI
== Update
)
1324 int Value
= Update
->getOperand(2).getImm();
1325 assert(AArch64_AM::getShiftValue(Update
->getOperand(3).getImm()) == 0 &&
1326 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1327 if (Update
->getOpcode() == AArch64::SUBXri
)
1330 unsigned NewOpc
= IsPreIdx
? getPreIndexedOpcode(I
->getOpcode())
1331 : getPostIndexedOpcode(I
->getOpcode());
1332 MachineInstrBuilder MIB
;
1333 if (!isPairedLdSt(*I
)) {
1334 // Non-paired instruction.
1335 MIB
= BuildMI(*I
->getParent(), I
, I
->getDebugLoc(), TII
->get(NewOpc
))
1336 .add(getLdStRegOp(*Update
))
1337 .add(getLdStRegOp(*I
))
1338 .add(getLdStBaseOp(*I
))
1340 .setMemRefs(I
->memoperands())
1341 .setMIFlags(I
->mergeFlagsWith(*Update
));
1343 // Paired instruction.
1344 int Scale
= getMemScale(*I
);
1345 MIB
= BuildMI(*I
->getParent(), I
, I
->getDebugLoc(), TII
->get(NewOpc
))
1346 .add(getLdStRegOp(*Update
))
1347 .add(getLdStRegOp(*I
, 0))
1348 .add(getLdStRegOp(*I
, 1))
1349 .add(getLdStBaseOp(*I
))
1350 .addImm(Value
/ Scale
)
1351 .setMemRefs(I
->memoperands())
1352 .setMIFlags(I
->mergeFlagsWith(*Update
));
1358 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1361 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1363 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
1364 LLVM_DEBUG(I
->print(dbgs()));
1365 LLVM_DEBUG(dbgs() << " ");
1366 LLVM_DEBUG(Update
->print(dbgs()));
1367 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1368 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
1369 LLVM_DEBUG(dbgs() << "\n");
1371 // Erase the old instructions for the block.
1372 I
->eraseFromParent();
1373 Update
->eraseFromParent();
1378 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr
&MemMI
,
1380 unsigned BaseReg
, int Offset
) {
1381 switch (MI
.getOpcode()) {
1384 case AArch64::SUBXri
:
1385 case AArch64::ADDXri
:
1386 // Make sure it's a vanilla immediate operand, not a relocation or
1387 // anything else we can't handle.
1388 if (!MI
.getOperand(2).isImm())
1390 // Watch out for 1 << 12 shifted value.
1391 if (AArch64_AM::getShiftValue(MI
.getOperand(3).getImm()))
1394 // The update instruction source and destination register must be the
1395 // same as the load/store base register.
1396 if (MI
.getOperand(0).getReg() != BaseReg
||
1397 MI
.getOperand(1).getReg() != BaseReg
)
1400 bool IsPairedInsn
= isPairedLdSt(MemMI
);
1401 int UpdateOffset
= MI
.getOperand(2).getImm();
1402 if (MI
.getOpcode() == AArch64::SUBXri
)
1403 UpdateOffset
= -UpdateOffset
;
1405 // For non-paired load/store instructions, the immediate must fit in a
1406 // signed 9-bit integer.
1407 if (!IsPairedInsn
&& (UpdateOffset
> 255 || UpdateOffset
< -256))
1410 // For paired load/store instructions, the immediate must be a multiple of
1411 // the scaling factor. The scaled offset must also fit into a signed 7-bit
1414 int Scale
= getMemScale(MemMI
);
1415 if (UpdateOffset
% Scale
!= 0)
1418 int ScaledOffset
= UpdateOffset
/ Scale
;
1419 if (ScaledOffset
> 63 || ScaledOffset
< -64)
1423 // If we have a non-zero Offset, we check that it matches the amount
1424 // we're adding to the register.
1425 if (!Offset
|| Offset
== UpdateOffset
)
1432 MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1433 MachineBasicBlock::iterator I
, int UnscaledOffset
, unsigned Limit
) {
1434 MachineBasicBlock::iterator E
= I
->getParent()->end();
1435 MachineInstr
&MemMI
= *I
;
1436 MachineBasicBlock::iterator MBBI
= I
;
1438 unsigned BaseReg
= getLdStBaseOp(MemMI
).getReg();
1439 int MIUnscaledOffset
= getLdStOffsetOp(MemMI
).getImm() * getMemScale(MemMI
);
1441 // Scan forward looking for post-index opportunities. Updating instructions
1442 // can't be formed if the memory instruction doesn't have the offset we're
1444 if (MIUnscaledOffset
!= UnscaledOffset
)
1447 // If the base register overlaps a destination register, we can't
1448 // merge the update.
1449 bool IsPairedInsn
= isPairedLdSt(MemMI
);
1450 for (unsigned i
= 0, e
= IsPairedInsn
? 2 : 1; i
!= e
; ++i
) {
1451 unsigned DestReg
= getLdStRegOp(MemMI
, i
).getReg();
1452 if (DestReg
== BaseReg
|| TRI
->isSubRegister(BaseReg
, DestReg
))
1456 // Track which register units have been modified and used between the first
1457 // insn (inclusive) and the second insn.
1458 ModifiedRegUnits
.clear();
1459 UsedRegUnits
.clear();
1461 for (unsigned Count
= 0; MBBI
!= E
&& Count
< Limit
; ++MBBI
) {
1462 MachineInstr
&MI
= *MBBI
;
1464 // Don't count transient instructions towards the search limit since there
1465 // may be different numbers of them if e.g. debug information is present.
1466 if (!MI
.isTransient())
1469 // If we found a match, return it.
1470 if (isMatchingUpdateInsn(*I
, MI
, BaseReg
, UnscaledOffset
))
1473 // Update the status of what the instruction clobbered and used.
1474 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1476 // Otherwise, if the base register is used or modified, we have no match, so
1478 if (!ModifiedRegUnits
.available(BaseReg
) ||
1479 !UsedRegUnits
.available(BaseReg
))
1485 MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1486 MachineBasicBlock::iterator I
, unsigned Limit
) {
1487 MachineBasicBlock::iterator B
= I
->getParent()->begin();
1488 MachineBasicBlock::iterator E
= I
->getParent()->end();
1489 MachineInstr
&MemMI
= *I
;
1490 MachineBasicBlock::iterator MBBI
= I
;
1492 unsigned BaseReg
= getLdStBaseOp(MemMI
).getReg();
1493 int Offset
= getLdStOffsetOp(MemMI
).getImm();
1495 // If the load/store is the first instruction in the block, there's obviously
1496 // not any matching update. Ditto if the memory offset isn't zero.
1497 if (MBBI
== B
|| Offset
!= 0)
1499 // If the base register overlaps a destination register, we can't
1500 // merge the update.
1501 bool IsPairedInsn
= isPairedLdSt(MemMI
);
1502 for (unsigned i
= 0, e
= IsPairedInsn
? 2 : 1; i
!= e
; ++i
) {
1503 unsigned DestReg
= getLdStRegOp(MemMI
, i
).getReg();
1504 if (DestReg
== BaseReg
|| TRI
->isSubRegister(BaseReg
, DestReg
))
1508 // Track which register units have been modified and used between the first
1509 // insn (inclusive) and the second insn.
1510 ModifiedRegUnits
.clear();
1511 UsedRegUnits
.clear();
1515 MachineInstr
&MI
= *MBBI
;
1517 // Don't count transient instructions towards the search limit since there
1518 // may be different numbers of them if e.g. debug information is present.
1519 if (!MI
.isTransient())
1522 // If we found a match, return it.
1523 if (isMatchingUpdateInsn(*I
, MI
, BaseReg
, Offset
))
1526 // Update the status of what the instruction clobbered and used.
1527 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1529 // Otherwise, if the base register is used or modified, we have no match, so
1531 if (!ModifiedRegUnits
.available(BaseReg
) ||
1532 !UsedRegUnits
.available(BaseReg
))
1534 } while (MBBI
!= B
&& Count
< Limit
);
1538 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1539 MachineBasicBlock::iterator
&MBBI
) {
1540 MachineInstr
&MI
= *MBBI
;
1541 // If this is a volatile load, don't mess with it.
1542 if (MI
.hasOrderedMemoryRef())
1545 // Make sure this is a reg+imm.
1546 // FIXME: It is possible to extend it to handle reg+reg cases.
1547 if (!getLdStOffsetOp(MI
).isImm())
1550 // Look backward up to LdStLimit instructions.
1551 MachineBasicBlock::iterator StoreI
;
1552 if (findMatchingStore(MBBI
, LdStLimit
, StoreI
)) {
1553 ++NumLoadsFromStoresPromoted
;
1554 // Promote the load. Keeping the iterator straight is a
1555 // pain, so we let the merge routine tell us what the next instruction
1556 // is after it's done mucking about.
1557 MBBI
= promoteLoadFromStore(MBBI
, StoreI
);
1563 // Merge adjacent zero stores into a wider store.
1564 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
1565 MachineBasicBlock::iterator
&MBBI
) {
1566 assert(isPromotableZeroStoreInst(*MBBI
) && "Expected narrow store.");
1567 MachineInstr
&MI
= *MBBI
;
1568 MachineBasicBlock::iterator E
= MI
.getParent()->end();
1570 if (!TII
->isCandidateToMergeOrPair(MI
))
1573 // Look ahead up to LdStLimit instructions for a mergable instruction.
1574 LdStPairFlags Flags
;
1575 MachineBasicBlock::iterator MergeMI
=
1576 findMatchingInsn(MBBI
, Flags
, LdStLimit
, /* FindNarrowMerge = */ true);
1578 ++NumZeroStoresPromoted
;
1580 // Keeping the iterator straight is a pain, so we let the merge routine tell
1581 // us what the next instruction is after it's done mucking about.
1582 MBBI
= mergeNarrowZeroStores(MBBI
, MergeMI
, Flags
);
1588 // Find loads and stores that can be merged into a single load or store pair
1590 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator
&MBBI
) {
1591 MachineInstr
&MI
= *MBBI
;
1592 MachineBasicBlock::iterator E
= MI
.getParent()->end();
1594 if (!TII
->isCandidateToMergeOrPair(MI
))
1597 // Early exit if the offset is not possible to match. (6 bits of positive
1598 // range, plus allow an extra one in case we find a later insn that matches
1600 bool IsUnscaled
= TII
->isUnscaledLdSt(MI
);
1601 int Offset
= getLdStOffsetOp(MI
).getImm();
1602 int OffsetStride
= IsUnscaled
? getMemScale(MI
) : 1;
1603 // Allow one more for offset.
1605 Offset
-= OffsetStride
;
1606 if (!inBoundsForPair(IsUnscaled
, Offset
, OffsetStride
))
1609 // Look ahead up to LdStLimit instructions for a pairable instruction.
1610 LdStPairFlags Flags
;
1611 MachineBasicBlock::iterator Paired
=
1612 findMatchingInsn(MBBI
, Flags
, LdStLimit
, /* FindNarrowMerge = */ false);
1615 if (TII
->isUnscaledLdSt(MI
))
1616 ++NumUnscaledPairCreated
;
1617 // Keeping the iterator straight is a pain, so we let the merge routine tell
1618 // us what the next instruction is after it's done mucking about.
1619 MBBI
= mergePairedInsns(MBBI
, Paired
, Flags
);
1625 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
1626 (MachineBasicBlock::iterator
&MBBI
) {
1627 MachineInstr
&MI
= *MBBI
;
1628 MachineBasicBlock::iterator E
= MI
.getParent()->end();
1629 MachineBasicBlock::iterator Update
;
1631 // Look forward to try to form a post-index instruction. For example,
1633 // add x20, x20, #32
1635 // ldr x0, [x20], #32
1636 Update
= findMatchingUpdateInsnForward(MBBI
, 0, UpdateLimit
);
1638 // Merge the update into the ld/st.
1639 MBBI
= mergeUpdateInsn(MBBI
, Update
, /*IsPreIdx=*/false);
1643 // Don't know how to handle unscaled pre/post-index versions below, so bail.
1644 if (TII
->isUnscaledLdSt(MI
.getOpcode()))
1647 // Look back to try to find a pre-index instruction. For example,
1651 // ldr x1, [x0, #8]!
1652 Update
= findMatchingUpdateInsnBackward(MBBI
, UpdateLimit
);
1654 // Merge the update into the ld/st.
1655 MBBI
= mergeUpdateInsn(MBBI
, Update
, /*IsPreIdx=*/true);
1659 // The immediate in the load/store is scaled by the size of the memory
1660 // operation. The immediate in the add we're looking for,
1661 // however, is not, so adjust here.
1662 int UnscaledOffset
= getLdStOffsetOp(MI
).getImm() * getMemScale(MI
);
1664 // Look forward to try to find a post-index instruction. For example,
1665 // ldr x1, [x0, #64]
1668 // ldr x1, [x0, #64]!
1669 Update
= findMatchingUpdateInsnForward(MBBI
, UnscaledOffset
, UpdateLimit
);
1671 // Merge the update into the ld/st.
1672 MBBI
= mergeUpdateInsn(MBBI
, Update
, /*IsPreIdx=*/true);
1679 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock
&MBB
,
1680 bool EnableNarrowZeroStOpt
) {
1681 bool Modified
= false;
1682 // Four tranformations to do here:
1683 // 1) Find loads that directly read from stores and promote them by
1684 // replacing with mov instructions. If the store is wider than the load,
1685 // the load will be replaced with a bitfield extract.
1688 // ldrh w2, [x0, #6]
1692 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
1694 if (isPromotableLoadFromStore(*MBBI
) && tryToPromoteLoadFromStore(MBBI
))
1699 // 2) Merge adjacent zero stores into a wider store.
1702 // strh wzr, [x0, #2]
1707 // str wzr, [x0, #4]
1710 if (EnableNarrowZeroStOpt
)
1711 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
1713 if (isPromotableZeroStoreInst(*MBBI
) && tryToMergeZeroStInst(MBBI
))
1718 // 3) Find loads and stores that can be merged into a single load or store
1719 // pair instruction.
1725 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
1727 if (TII
->isPairableLdStInst(*MBBI
) && tryToPairLdStInst(MBBI
))
1732 // 4) Find base register updates that can be merged into the load or store
1733 // as a base-reg writeback.
1739 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
1741 if (isMergeableLdStUpdate(*MBBI
) && tryToMergeLdStUpdate(MBBI
))
1750 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction
&Fn
) {
1751 if (skipFunction(Fn
.getFunction()))
1754 Subtarget
= &static_cast<const AArch64Subtarget
&>(Fn
.getSubtarget());
1755 TII
= static_cast<const AArch64InstrInfo
*>(Subtarget
->getInstrInfo());
1756 TRI
= Subtarget
->getRegisterInfo();
1757 AA
= &getAnalysis
<AAResultsWrapperPass
>().getAAResults();
1759 // Resize the modified and used register unit trackers. We do this once
1760 // per function and then clear the register units each time we optimize a load
1762 ModifiedRegUnits
.init(*TRI
);
1763 UsedRegUnits
.init(*TRI
);
1765 bool Modified
= false;
1766 bool enableNarrowZeroStOpt
= !Subtarget
->requiresStrictAlign();
1767 for (auto &MBB
: Fn
)
1768 Modified
|= optimizeBlock(MBB
, enableNarrowZeroStOpt
);
1773 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
1774 // stores near one another? Note: The pre-RA instruction scheduler already has
1775 // hooks to try and schedule pairable loads/stores together to improve pairing
1776 // opportunities. Thus, pre-RA pairing pass may not be worth the effort.
1778 // FIXME: When pairing store instructions it's very possible for this pass to
1779 // hoist a store with a KILL marker above another use (without a KILL marker).
1780 // The resulting IR is invalid, but nothing uses the KILL markers after this
1781 // pass, so it's never caused a problem in practice.
1783 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1784 /// load / store optimization pass.
1785 FunctionPass
*llvm::createAArch64LoadStoreOptimizationPass() {
1786 return new AArch64LoadStoreOpt();