//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
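//
// A few illustrative rewrites this pass performs (simplified examples; the
// exact forms handled are determined by the routines below):
//   * Pairing:        ldr x0, [sp] + ldr x1, [sp, #8]   => ldp x0, x1, [sp]
//   * Zero stores:    str wzr, [x1] + str wzr, [x1, #4] => str xzr, [x1]
//   * Index folding:  ldr x2, [x3] + add x3, x3, #16    => ldr x2, [x3], #16
//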
14 #include "AArch64InstrInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "llvm/ADT/BitVector.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/iterator_range.h"
22 #include "llvm/Analysis/AliasAnalysis.h"
23 #include "llvm/CodeGen/MachineBasicBlock.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineFunctionPass.h"
26 #include "llvm/CodeGen/MachineInstr.h"
27 #include "llvm/CodeGen/MachineInstrBuilder.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/CodeGen/TargetRegisterInfo.h"
30 #include "llvm/IR/DebugLoc.h"
31 #include "llvm/MC/MCRegisterInfo.h"
32 #include "llvm/Pass.h"
33 #include "llvm/Support/CommandLine.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};
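
// For instance (illustrative register choices), when pairing a sign-extending
// load with a plain load:
//   ldrsw x0, [sp]
//   ldr   w1, [sp, #4]
// the pair is formed on the 32-bit results and SExtIdx records which result
// (here index 0) must afterwards be sign-extended back to 64 bits;
// mergePairedInsns below emits the extra KILL/SBFMXri instructions for that.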

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero stores indicated into a single wider store.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// These instructions set a memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return true;
  }
}

// Scaling factor for unscaled load or store.
static int getMemScale(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
    return 16;
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return 16;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}

static bool isMatchingStore(MachineInstr &LoadInst,
                            MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  case AArch64::STGOffset:
    return AArch64::STGPreIndex;
  case AArch64::STZGOffset:
    return AArch64::STZGPreIndex;
  case AArch64::ST2GOffset:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2GOffset:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  case AArch64::STGOffset:
    return AArch64::STGPostIndex;
  case AArch64::STZGOffset:
    return AArch64::STZGPostIndex;
  case AArch64::ST2GOffset:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2GOffset:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}

static bool isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // All LDP/STP/STGP forms with an immediate offset are paired.
  case AArch64::LDPSi:  case AArch64::LDPSWi: case AArch64::LDPDi:
  case AArch64::LDPQi:  case AArch64::LDPWi:  case AArch64::LDPXi:
  case AArch64::STPSi:  case AArch64::STPDi:  case AArch64::STPQi:
  case AArch64::STPWi:  case AArch64::STPXi:  case AArch64::STGPi:
    return true;
  }
}

// Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                       int &MinOffset, int &MaxOffset) {
  bool IsPaired = isPairedLdSt(MI);
  bool IsTagStore = isTagStore(MI);
  // ST*G and all paired ldst have the same scale in pre/post-indexed variants
  // as in the "unsigned offset" variant.
  // All other pre/post indexed ldst instructions are unscaled.
  Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;

  // Paired forms take a signed 7-bit scaled immediate; everything else takes
  // a signed 9-bit byte offset.
  if (IsPaired) {
    MinOffset = -64;
    MaxOffset = 63;
  } else {
    MinOffset = -256;
    MaxOffset = 255;
  }
}

static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI.getOperand(Idx);
}

static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = getMemScale(LoadInst);
  int StoreSize = getMemScale(StoreInst);
  int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
                             ? getLdStOffsetOp(StoreInst).getImm()
                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
                             ? getLdStOffsetOp(LoadInst).getImm()
                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
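
// For example (byte offsets): an 8-byte store at offset 0 covers a 4-byte load
// at offset 4, since 0 <= 4 and 4 + 4 <= 0 + 8, so the load can potentially be
// satisfied from the stored value.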

static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}
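
// E.g. "strh wzr, [x0]" and "strh wzr, [x0, #2]" are both promotable and can
// later be merged into a single "str wzr, [x0]" (see tryToMergeZeroStInst and
// the examples in optimizeBlock).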

static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}
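
// Illustrative case (register names arbitrary): after
//   str x0, [sp, #8]
//   ldr x1, [sp, #8]
// the load reads back exactly what was just stored, so promoteLoadFromStore
// below can rewrite it as a register copy (or a bitfield extract when the
// load is narrower than the store).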

static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:  case AArch64::LDPSWi: case AArch64::LDPDi:
  case AArch64::LDPQi:  case AArch64::LDPWi:  case AArch64::LDPXi:
  case AArch64::STPSi:  case AArch64::STPDi:  case AArch64::STPQi:
  case AArch64::STPWi:  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!getLdStOffsetOp(MI).isImm())
      return false;

    return true;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    ++NextI;

  unsigned Opc = I->getOpcode();
  bool IsScaled = !TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsScaled ? 1 : getMemScale(*I);

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI;
  if (getLdStOffsetOp(*I).getImm() ==
      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
    RtMI = &*MergeMI;
  else
    RtMI = &*I;

  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Change the scaled offset from small to large type.
  if (IsScaled) {
    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
    OffsetImm /= 2;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(*MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instruction:\n ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);

  int Offset = getLdStOffsetOp(*I).getImm();
  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
    assert(!(OffsetImm % getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
      RegOp0.setIsKill(false);
      RegOp1.setIsKill(false);
    } else {
      // Clear kill flags of the first stores register. Example:
      //   USE kill %w1 ; need to clear kill flag when moving STRWui downwards
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }

  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
            .add(RegOp0)
            .add(RegOp1)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*Paired})
            .setMIFlags(I->mergeFlagsWith(*Paired));

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instruction:\n ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // LDRSW-style (sign-extending) load.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the original X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    LLVM_DEBUG(dbgs() << " Extend operand:\n ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI = LoadI;
  ++NextI;

  int LoadSize = getMemScale(*LoadI);
  int StoreSize = getMemScale(*StoreI);
  Register LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  Register StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load, if the destination register of the load is the same
    // register as the stored value.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset = IsUnscaled
                               ? getLdStOffsetOp(*LoadI).getImm()
                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset = IsUnscaled
                               ? getLdStOffsetOp(*StoreI).getImm()
                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
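    // Worked example (little-endian, register choice arbitrary): for
    //   str x0, [sp]        ; UnscaledStOffset = 0, StoreSize = 8
    //   ldrh w1, [sp, #2]   ; UnscaledLdOffset = 2, LoadSize = 2, Width = 16
    // we get Immr = 16 and Imms = 31, so the UBFM below extracts bits [31:16]
    // of the stored value, which is exactly the halfword the load would read.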
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0);              // imms

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instructions:\n ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
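
// For example, alignTo(5, 4) == 8 and alignTo(8, 4) == 8; the pass uses this
// to check whether a merged offset is still representable after scaling.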

static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
                     AliasAnalysis *AA) {
  // One of the instructions must modify memory.
  if (!MIa.mayStore() && !MIb.mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
    return false;

  return MIa.mayAlias(AA, MIb, /*UseTBAA*/ false);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns)
    if (mayAlias(MIa, *MIb, AA))
      return true;

  return false;
}

bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == getLdStBaseOp(MI).getReg() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}

// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: nothing more to check.
  if (OpcA == OpcB)
    return true;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}

/// Scan the instructions looking for a load/store that can be combined with
/// the current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &FirstMI = *I;
  ++MBBI;

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                            TRI);
          MemInsns.push_back(&MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
            !(MI.mayLoad() &&
              !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
            !mayAlias(MI, MemInsns, AA)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
            !(MayLoad &&
              !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
            !mayAlias(FirstMI, MemInsns, AA)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall())
      return E;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  (void)MIB;

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instruction:\n ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions which
  // ignore the address part of the source register.
  // This does not apply to STGPi as well, which does not have unpredictable
  // behavior in this case unlike normal stores, and always performs writeback
  // after reading the source register value.
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
  }
  return E;
}
AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1548 MachineBasicBlock::iterator I
, unsigned Limit
) {
1549 MachineBasicBlock::iterator B
= I
->getParent()->begin();
1550 MachineBasicBlock::iterator E
= I
->getParent()->end();
1551 MachineInstr
&MemMI
= *I
;
1552 MachineBasicBlock::iterator MBBI
= I
;
1554 Register BaseReg
= getLdStBaseOp(MemMI
).getReg();
1555 int Offset
= getLdStOffsetOp(MemMI
).getImm();
1557 // If the load/store is the first instruction in the block, there's obviously
1558 // not any matching update. Ditto if the memory offset isn't zero.
1559 if (MBBI
== B
|| Offset
!= 0)
1561 // If the base register overlaps a destination register, we can't
1562 // merge the update.
1563 if (!isTagStore(MemMI
)) {
1564 bool IsPairedInsn
= isPairedLdSt(MemMI
);
1565 for (unsigned i
= 0, e
= IsPairedInsn
? 2 : 1; i
!= e
; ++i
) {
1566 Register DestReg
= getLdStRegOp(MemMI
, i
).getReg();
1567 if (DestReg
== BaseReg
|| TRI
->isSubRegister(BaseReg
, DestReg
))
1572 // Track which register units have been modified and used between the first
1573 // insn (inclusive) and the second insn.
1574 ModifiedRegUnits
.clear();
1575 UsedRegUnits
.clear();
1579 MachineInstr
&MI
= *MBBI
;
1581 // Don't count transient instructions towards the search limit since there
1582 // may be different numbers of them if e.g. debug information is present.
1583 if (!MI
.isTransient())
1586 // If we found a match, return it.
1587 if (isMatchingUpdateInsn(*I
, MI
, BaseReg
, Offset
))
1590 // Update the status of what the instruction clobbered and used.
1591 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1593 // Otherwise, if the base register is used or modified, we have no match, so
1595 if (!ModifiedRegUnits
.available(BaseReg
) ||
1596 !UsedRegUnits
.available(BaseReg
))
1598 } while (MBBI
!= B
&& Count
< Limit
);

bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}
1628 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
1629 MachineBasicBlock::iterator
&MBBI
) {
1630 assert(isPromotableZeroStoreInst(*MBBI
) && "Expected narrow store.");
1631 MachineInstr
&MI
= *MBBI
;
1632 MachineBasicBlock::iterator E
= MI
.getParent()->end();
1634 if (!TII
->isCandidateToMergeOrPair(MI
))
1637 // Look ahead up to LdStLimit instructions for a mergable instruction.
1638 LdStPairFlags Flags
;
1639 MachineBasicBlock::iterator MergeMI
=
1640 findMatchingInsn(MBBI
, Flags
, LdStLimit
, /* FindNarrowMerge = */ true);
1642 ++NumZeroStoresPromoted
;
1644 // Keeping the iterator straight is a pain, so we let the merge routine tell
1645 // us what the next instruction is after it's done mucking about.
1646 MBBI
= mergeNarrowZeroStores(MBBI
, MergeMI
, Flags
);

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->isUnscaledLdSt(MI);
  int Offset = getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->isUnscaledLdSt(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  // ldr x0, [x20]
  // add x20, x20, #32
  //   merged into:
  // ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->isUnscaledLdSt(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  // add x0, x0, #8
  // ldr x1, [x0]
  //   merged into:
  // ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  // ldr x1, [x0, #64]
  // add x0, x0, #64
  //   merged into:
  // ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }

  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a
  // load or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, a pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}