//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero-store instructions indicated into a single wider
  // store instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)
static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}
// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
    return 16;
  }
}
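// The scaled (LDR*ui/STR*ui) forms encode their immediate in multiples of the
// access size, while the unscaled (LDUR*/STUR*) forms use a byte offset; e.g.
// an LDRWui immediate of 3 and an LDURWi immediate of 12 both address the word
// at base + 12.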
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}
static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}
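// For example, two adjacent "strh wzr" stores can be rewritten as a single
// "str wzr" by mapping STRHHui to STRWui here (see mergeNarrowZeroStores).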
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}
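// For example, "ldr x0, [sp, #8]" followed by "ldr x1, [sp, #16]" maps through
// LDRXui -> LDPXi and becomes "ldp x0, x1, [sp, #8]".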
static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}
static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}
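// For example, when "ldr x0, [x20]" is followed by "add x20, x20, #32", the
// pair can be rewritten as the post-indexed "ldr x0, [x20], #32" using
// LDRXui -> LDRXpost (see tryToMergeLdStUpdate below).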
static bool isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}
static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI.getOperand(Idx);
}
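// For a paired instruction such as "LDPXi x0, x1, x2, 3" the two register
// operands are at indices 0 and 1, the base register at index 2, and the
// scaled immediate at index 3; non-paired forms shift each down by one.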
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = getMemScale(LoadInst);
  int StoreSize = getMemScale(StoreInst);
  int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
                             ? getLdStOffsetOp(StoreInst).getImm()
                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
                             ? getLdStOffsetOp(LoadInst).getImm()
                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
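// For example, "str w1, [x0, #4]" covers bytes 4..7, so a following
// "ldrh w2, [x0, #6]" (bytes 6..7) is in range and can be fed from the store.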
static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}
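// E.g. "strh wzr, [x0, #2]" and "str wzr, [x0]" qualify, but a 64-bit
// "str xzr, [x0]" does not: only WZR stores of 32 bits or less are widened.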
static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}
static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!getLdStOffsetOp(MI).isImm())
      return false;

    return true;
  }
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    ++NextI;

  unsigned Opc = I->getOpcode();
  bool IsScaled = !TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsScaled ? 1 : getMemScale(*I);

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI;
  if (getLdStOffsetOp(*I).getImm() ==
      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
    RtMI = &*MergeMI;
  else
    RtMI = &*I;

  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Change the scaled offset from small to large type.
  if (IsScaled) {
    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
    OffsetImm /= 2;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(*MergeMI));

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);

  int Offset = getLdStOffsetOp(*I).getImm();
  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
    assert(!(OffsetImm % getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui %w0, ...
      //   USE %w1
      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      RegOp0.setIsKill(false);
      RegOp1.setIsKill(false);
    } else {
      // Clear kill flags of the first stores register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1  ; need to clear kill flag when moving STRWui downwards
      //   STRWui %w0, ...
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
            .add(RegOp0)
            .add(RegOp1)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*Paired})
            .setMIFlags(I->mergeFlagsWith(*Paired));

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // SEXT instruction.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI = LoadI;
  ++NextI;

  int LoadSize = getMemScale(*LoadI);
  int StoreSize = getMemScale(*StoreI);
  Register LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  Register StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load, if the destination register of the loads is the same
    // register for stored value.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are in the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset = IsUnscaled
                               ? getLdStOffsetOp(*LoadI).getImm()
                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset = IsUnscaled
                               ? getLdStOffsetOp(*StoreI).getImm()
                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0);              // imms

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}
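// The element range [-64, 63] mirrors the signed 7-bit scaled immediate of the
// paired instructions; for 8-byte accesses that corresponds to byte offsets
// -512..504.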
// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint_64t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
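// E.g. alignTo(5, 4) == 8 and alignTo(8, 4) == 8.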
static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
                     AliasAnalysis *AA) {
  // One of the instructions must modify memory.
  if (!MIa.mayStore() && !MIb.mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
    return false;

  return MIa.mayAlias(AA, MIb, /*UseTBAA*/ false);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns)
    if (mayAlias(MIa, *MIb, AA))
      return true;

  return false;
}
bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == getLdStBaseOp(MI).getReg() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}
// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: nothing more to check.
  if (OpcA == OpcB)
    return true;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
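// For example, an LDRSWui paired with an LDRWui is emitted as an LDPWi (both
// results as W registers); SExtIdx records which result mergePairedInsns must
// still sign-extend with an SBFMXri afterwards.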
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &FirstMI = *I;
  ++MBBI;

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                            TRI);
          MemInsns.push_back(&MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
            !(MI.mayLoad() &&
              !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
            !mayAlias(MI, MemInsns, AA)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
            !(MayLoad &&
              !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
            !mayAlias(FirstMI, MemInsns, AA)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store.  Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall())
      return E;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(getLdStBaseOp(*I))
              .addImm(Value)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    int Scale = getMemScale(*I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor. The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 63 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    Register DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
  }
  return E;
}
AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1484 MachineBasicBlock::iterator I
, unsigned Limit
) {
1485 MachineBasicBlock::iterator B
= I
->getParent()->begin();
1486 MachineBasicBlock::iterator E
= I
->getParent()->end();
1487 MachineInstr
&MemMI
= *I
;
1488 MachineBasicBlock::iterator MBBI
= I
;
1490 Register BaseReg
= getLdStBaseOp(MemMI
).getReg();
1491 int Offset
= getLdStOffsetOp(MemMI
).getImm();
1493 // If the load/store is the first instruction in the block, there's obviously
1494 // not any matching update. Ditto if the memory offset isn't zero.
1495 if (MBBI
== B
|| Offset
!= 0)
1497 // If the base register overlaps a destination register, we can't
1498 // merge the update.
1499 bool IsPairedInsn
= isPairedLdSt(MemMI
);
1500 for (unsigned i
= 0, e
= IsPairedInsn
? 2 : 1; i
!= e
; ++i
) {
1501 Register DestReg
= getLdStRegOp(MemMI
, i
).getReg();
1502 if (DestReg
== BaseReg
|| TRI
->isSubRegister(BaseReg
, DestReg
))
1506 // Track which register units have been modified and used between the first
1507 // insn (inclusive) and the second insn.
1508 ModifiedRegUnits
.clear();
1509 UsedRegUnits
.clear();
1513 MachineInstr
&MI
= *MBBI
;
1515 // Don't count transient instructions towards the search limit since there
1516 // may be different numbers of them if e.g. debug information is present.
1517 if (!MI
.isTransient())
1520 // If we found a match, return it.
1521 if (isMatchingUpdateInsn(*I
, MI
, BaseReg
, Offset
))
1524 // Update the status of what the instruction clobbered and used.
1525 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1527 // Otherwise, if the base register is used or modified, we have no match, so
1529 if (!ModifiedRegUnits
.available(BaseReg
) ||
1530 !UsedRegUnits
.available(BaseReg
))
1532 } while (MBBI
!= B
&& Count
< Limit
);
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}
// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}
// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->isUnscaledLdSt(MI);
  int Offset = getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->isUnscaledLdSt(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}
bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
    (MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  // ldr x0, [x20]
  // add x20, x20, #32
  //   merged into:
  // ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->isUnscaledLdSt(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  // add x0, x0, #8
  // ldr x1, [x0]
  //   merged into:
  // ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

  // Look forward to try to find a post-index instruction. For example,
  // ldr x1, [x0, #64]
  // add x0, x0, #64
  //   merged into:
  // ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }

  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);

  return Modified;
}
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}