//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");
// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {
using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  // If not none, RenameReg can be used to rename the result register of the
  // first store in a pair. Currently this only works when merging stores
  // forward.
  Optional<MCPhysReg> RenameReg = None;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = None; }
  Optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  LiveRegUnits DefinedInBB;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);
  // Merge the two narrow zero stores indicated into a single wider store
  // instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);
  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;
} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)
static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}
// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return true;
  }
}
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}
static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRDpre:
    return AArch64::STPDpre;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRQpre:
    return AArch64::STPQpre;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRWpre:
    return AArch64::STPWpre;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::STRXpre:
    return AArch64::STPXpre;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRSpre:
    return AArch64::LDPSpre;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRDpre:
    return AArch64::LDPDpre;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRQpre:
    return AArch64::LDPQpre;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRWpre:
    return AArch64::LDPWpre;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRXpre:
    return AArch64::LDPXpre;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}
static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}
static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  case AArch64::STGOffset:
    return AArch64::STGPreIndex;
  case AArch64::STZGOffset:
    return AArch64::STZGPreIndex;
  case AArch64::ST2GOffset:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2GOffset:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  case AArch64::STGOffset:
    return AArch64::STGPostIndex;
  case AArch64::STZGOffset:
    return AArch64::STZGPostIndex;
  case AArch64::ST2GOffset:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2GOffset:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}
static bool isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STGPi:
    return true;
  }
}
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  switch (OpcA) {
  default:
    return false;
  case AArch64::STRSpre:
    return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  case AArch64::STRDpre:
    return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  case AArch64::STRQpre:
    return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  case AArch64::STRWpre:
    return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  case AArch64::STRXpre:
    return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  case AArch64::LDRSpre:
    return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  case AArch64::LDRDpre:
    return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  case AArch64::LDRQpre:
    return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  case AArch64::LDRWpre:
    return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  case AArch64::LDRXpre:
    return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  }
}
// Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                       int &MinOffset, int &MaxOffset) {
  bool IsPaired = isPairedLdSt(MI);
  bool IsTagStore = isTagStore(MI);
  // ST*G and all paired ldst have the same scale in pre/post-indexed variants
  // as in the "unsigned offset" variant.
  // All other pre/post indexed ldst instructions are unscaled.
  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;

  if (IsPaired) {
    MinOffset = -64;
    MaxOffset = 63;
  } else {
    MinOffset = -256;
    MaxOffset = 255;
  }
}
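// Example of how the ranges above are meant to be read (illustrative, not
// from the original source): paired pre/post-indexed forms use a scaled
// 7-bit signed immediate, so an LDP of X registers (Scale == 8) accepts byte
// offsets from -512 to 504, while non-paired forms use an unscaled 9-bit
// signed immediate in the range [-256, 255].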
static MachineOperand &getLdStRegOp(MachineInstr &MI,
                                    unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
  if (IsPreLdSt)
    PairedRegOp += 1;
  unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}
static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
  return MI.getOperand(Idx);
}
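// Operand layout assumed by the two helpers above (illustrative): a plain
// "LDRXui %x0, %x1, 0" has the transfer register at index 0, the base at 1
// and the immediate at 2, while paired and pre-indexed forms carry an extra
// leading operand (a second transfer register or the written-back base),
// which shifts the base and offset to indices 2 and 3.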
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = TII->getMemScale(LoadInst);
  int StoreSize = TII->getMemScale(StoreInst);
  int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst)
                             ? getLdStOffsetOp(StoreInst).getImm()
                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst)
                             ? getLdStOffsetOp(LoadInst).getImm()
                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
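// Worked example for the containment check above (illustrative): for
//   STRXui %x1, %x0, 1   ; stores bytes [8, 16) relative to x0
//   LDRWui %w2, %x0, 3   ; reads bytes [12, 16)
// the scaled immediates are converted to byte offsets (8 and 12), and the
// loaded bytes fall entirely inside the stored bytes, so the load can be
// satisfied from the stored value.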
static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}
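// For illustration (not from the original comments): "STRHHui %wzr, %x0, 0"
// and "STRWui %wzr, %x0, 0" are promotable zero stores, whereas
// "STRWui %w1, %x0, 0" is not, because the stored value is not WZR.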
static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}
static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!getLdStOffsetOp(MI).isImm())
      return false;

    return true;
  }
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    NextI = next_nodbg(NextI, E);

  unsigned Opc = I->getOpcode();
  bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI;
  if (getLdStOffsetOp(*I).getImm() ==
      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
    RtMI = &*MergeMI;
  else
    RtMI = &*I;

  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Change the scaled offset from small to large type.
  if (IsScaled) {
    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
    OffsetImm /= 2;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(*MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}
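// Example of the zero-store widening performed above (illustrative, assuming
// adjacent offsets and a WZR source):
//   STRHHui %wzr, %x0, 0
//   STRHHui %wzr, %x0, 1
// is rewritten into the single wider store
//   STRWui %wzr, %x0, 0
// with the two original memory operands merged onto the new instruction.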
// Apply Fn to all instructions between MI and the beginning of the block, until
// a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stop after visiting Limit iterations.
static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
                              const TargetRegisterInfo *TRI, unsigned Limit,
                              std::function<bool(MachineInstr &, bool)> &Fn) {
  auto MBB = MI.getParent();
  for (MachineInstr &I :
       instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
    if (!Limit)
      return false;
    --Limit;

    bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
      return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
             TRI->regsOverlap(MOP.getReg(), DefReg);
    });
    if (!Fn(I, isDef))
      return false;
    if (isDef)
      break;
  }
  return true;
}
&MI
, LiveRegUnits
&Units
,
835 const TargetRegisterInfo
*TRI
) {
837 for (const MachineOperand
&MOP
: phys_regs_and_masks(MI
))
838 if (MOP
.isReg() && MOP
.isKill())
839 Units
.removeReg(MOP
.getReg());
841 for (const MachineOperand
&MOP
: phys_regs_and_masks(MI
))
842 if (MOP
.isReg() && !MOP
.isKill())
843 Units
.addReg(MOP
.getReg());
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    NextI = next_nodbg(NextI, E);

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();

  Optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (MergeForward && RenameReg) {
    MCRegister RegToRename = getLdStRegOp(*I).getReg();
    DefinedInBB.addReg(*RenameReg);

    // Return the sub/super register for RenameReg, matching the size of
    // OriginalReg.
    auto GetMatchingSubReg = [this,
                              RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
      for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
        if (TRI->getMinimalPhysRegClass(OriginalReg) ==
            TRI->getMinimalPhysRegClass(SubOrSuper))
          return SubOrSuper;
      llvm_unreachable("Should have found matching sub or super register!");
    };

    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (auto &MOP : MI.operands()) {
              // Rename the first explicit definition and all implicit
              // definitions matching RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                MOP.setReg(GetMatchingSubReg(MOP.getReg()));
                SeenDef = true;
              }
            }
          } else {
            for (auto &MOP : MI.operands()) {
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                MOP.setReg(GetMatchingSubReg(MOP.getReg()));
              }
            }
          }
          LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
          return true;
        };
    forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);

    // Make sure the register used for renaming is not used between the paired
    // instructions. That would trash the content before the new paired
    // instruction.
    for (auto &MI :
         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
             std::next(I), std::next(Paired)))
      assert(all_of(MI.operands(),
                    [this, &RenameReg](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             !TRI->regsOverlap(MOP.getReg(), *RenameReg);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
  }

  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);

  int Offset = getLdStOffsetOp(*I).getImm();
  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = TII->getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre load/stores the Rt should be the one of the pre
  // load/store.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(*I)) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
      RegOp0.setIsKill(false);
      RegOp1.setIsKill(false);
    } else {
      // Clear kill flags of the first stores register. Example:
      //   USE kill %w1 ; need to clear kill flag when moving STRWui downwards
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }

  unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));

  // Adds the pre-index operand for pre-indexed ld/st pairs.
  if (AArch64InstrInfo::isPreLdSt(*RtMI))
    MIB.addReg(BaseRegOp.getReg(), RegState::Define);

  MIB.add(RegOp0)
      .add(RegOp1)
      .add(BaseRegOp)
      .addImm(OffsetImm)
      .cloneMergedMemRefs({&*I, &*Paired})
      .setMIFlags(I->mergeFlagsWith(*Paired));

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // SEXTLOAD.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(*I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(MOP.getReg());

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
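// Sketch of the overall pairing rewrite implemented above (illustrative):
//   LDRXui %x0, %x2, 0
//   ...                ; no interference with x0/x1/x2 in between
//   LDRXui %x1, %x2, 1
// becomes
//   LDPXi %x0, %x1, %x2, 0
// and, when a sign-extending LDRSWui is paired with a plain LDRWui, the
// extended result is additionally materialised with a KILL plus SBFMXri
// (an sxtw) after the generated LDP.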
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI =
      next_nodbg(LoadI, LoadI->getParent()->end());

  int LoadSize = TII->getMemScale(*LoadI);
  int StoreSize = TII->getMemScale(*StoreI);
  Register LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  Register StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load, if the destination register of the loads is the same
    // register for stored value.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are in the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
    assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset = IsUnscaled
                               ? getLdStOffsetOp(*LoadI).getImm()
                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset = IsUnscaled
                               ? getLdStOffsetOp(*StoreI).getImm()
                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0);              // imms

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
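// Illustrative example of the store-to-load forwarding above (not from the
// original comments): on a little-endian target,
//   STRXui %x1, %x0, 0   ; store 8 bytes from x1
//   LDRWui %w2, %x0, 0   ; reload the low 4 bytes
// the load is rewritten into a bitfield extract of the stored register (an
// AND when the byte offsets match, a UBFM when the load reads from a
// non-zero offset inside the stored value), so no memory access remains.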
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
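// Quick sanity check of the helper above (illustrative): alignTo(13, 8)
// rounds up to 16 and alignTo(16, 8) stays 16, which is how the callers
// below test whether a merged offset is already suitably aligned
// (alignTo(MinOffset, Stride) == MinOffset).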
static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns)
    if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
      return true;

  return false;
}
bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also we can't handle stores without an immediate offset operand,
    // while the operand might be the address for a global variable.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}
// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(FirstMI);

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergeable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
  // LDR<S,D,Q,W,X>pre-LDR<S,D,Q,W,X>ui
  // are candidate pairs that can be merged.
  if (isPreLdStPairCandidate(FirstMI, MI))
    return true;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.
  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  MachineFunction &MF = *FirstMI.getParent()->getParent();
  if (!RegClass || !MF.getRegInfo().tracksLiveness())
    return false;

  auto RegToRename = getLdStRegOp(FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  if (!getLdStRegOp(FirstMI).isKill() &&
      !any_of(FirstMI.operands(),
              [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegToRename, MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI << "\n");
    return false;
  }
  auto canRenameMOP = [TRI](const MachineOperand &MOP) {
    if (MOP.isReg()) {
      auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
      // Renaming registers with multiple disjunct sub-registers (e.g. the
      // result of a LD3) means that all sub-registers are renamed, potentially
      // impacting other instructions we did not check. Bail out.
      // Note that this relies on the structure of the AArch64 register file. In
      // particular, a subregister cannot be written without overwriting the
      // larger register.
      if (RegClass->HasDisjunctSubRegs) {
        LLVM_DEBUG(
            dbgs()
            << "  Cannot rename operands with multiple disjunct subregisters ("
            << MOP << ")\n");
        return false;
      }
    }
    return MOP.isImplicit() ||
           (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
  };

  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions currently ("
                        << MI << ")\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find an non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo instruction " << MI
                          << "\n");
        return false;
      }

      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;
        if (!canRenameMOP(MOP)) {
          LLVM_DEBUG(dbgs()
                     << "  Cannot rename " << MOP << " in " << MI << "\n");
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
      return true;
    } else {
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;

        if (!canRenameMOP(MOP)) {
          LLVM_DEBUG(dbgs()
                     << "  Cannot rename " << MOP << " in " << MI << "\n");
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}
// Check if we can find a physical register for renaming \p Reg. This register
// must:
// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
//   defined registers up to the point where the renamed register will be used,
// * not used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range the rename register will be used,
// * is available in all used register classes (checked using RequiredClasses).
static Optional<MCPhysReg> tryToFindRegisterToRename(
    const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(TRI->sub_and_superregs_inclusive(PR),
                  [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
      return any_of(TRI->sub_and_superregs_inclusive(PR),
                    [C, TRI](MCPhysReg SubOrSuper) {
                      return C == TRI->getMinimalPhysRegClass(SubOrSuper);
                    });
    });
  };

  auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return None;
}
1520 /// Scan the instructions looking for a load/store that can be combined with the
1521 /// current instruction into a wider equivalent or a load/store pair.
1522 MachineBasicBlock::iterator
1523 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I
,
1524 LdStPairFlags
&Flags
, unsigned Limit
,
1525 bool FindNarrowMerge
) {
1526 MachineBasicBlock::iterator E
= I
->getParent()->end();
1527 MachineBasicBlock::iterator MBBI
= I
;
1528 MachineBasicBlock::iterator MBBIWithRenameReg
;
1529 MachineInstr
&FirstMI
= *I
;
1530 MBBI
= next_nodbg(MBBI
, E
);
1532 bool MayLoad
= FirstMI
.mayLoad();
1533 bool IsUnscaled
= TII
->hasUnscaledLdStOffset(FirstMI
);
1534 Register Reg
= getLdStRegOp(FirstMI
).getReg();
1535 Register BaseReg
= getLdStBaseOp(FirstMI
).getReg();
1536 int Offset
= getLdStOffsetOp(FirstMI
).getImm();
1537 int OffsetStride
= IsUnscaled
? TII
->getMemScale(FirstMI
) : 1;
1538 bool IsPromotableZeroStore
= isPromotableZeroStoreInst(FirstMI
);
1540 Optional
<bool> MaybeCanRename
= None
;
1541 if (!EnableRenaming
)
1542 MaybeCanRename
= {false};
1544 SmallPtrSet
<const TargetRegisterClass
*, 5> RequiredClasses
;
1545 LiveRegUnits UsedInBetween
;
1546 UsedInBetween
.init(*TRI
);
1548 Flags
.clearRenameReg();
1550 // Track which register units have been modified and used between the first
1551 // insn (inclusive) and the second insn.
1552 ModifiedRegUnits
.clear();
1553 UsedRegUnits
.clear();
1555 // Remember any instructions that read/write memory between FirstMI and MI.
1556 SmallVector
<MachineInstr
*, 4> MemInsns
;
1558 for (unsigned Count
= 0; MBBI
!= E
&& Count
< Limit
;
1559 MBBI
= next_nodbg(MBBI
, E
)) {
1560 MachineInstr
&MI
= *MBBI
;
1562 UsedInBetween
.accumulate(MI
);
1564 // Don't count transient instructions towards the search limit since there
1565 // may be different numbers of them if e.g. debug information is present.
1566 if (!MI
.isTransient())
1569 Flags
.setSExtIdx(-1);
1570 if (areCandidatesToMergeOrPair(FirstMI
, MI
, Flags
, TII
) &&
1571 getLdStOffsetOp(MI
).isImm()) {
1572 assert(MI
.mayLoadOrStore() && "Expected memory operation.");
1573 // If we've found another instruction with the same opcode, check to see
1574 // if the base and offset are compatible with our starting instruction.
1575 // These instructions all have scaled immediate operands, so we just
1576 // check for +1/-1. Make sure to check the new instruction offset is
1577 // actually an immediate and not a symbolic reference destined for
1579 Register MIBaseReg
= getLdStBaseOp(MI
).getReg();
1580 int MIOffset
= getLdStOffsetOp(MI
).getImm();
1581 bool MIIsUnscaled
= TII
->hasUnscaledLdStOffset(MI
);
1582 if (IsUnscaled
!= MIIsUnscaled
) {
1583 // We're trying to pair instructions that differ in how they are scaled.
1584 // If FirstMI is scaled then scale the offset of MI accordingly.
1585 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1586 int MemSize
= TII
->getMemScale(MI
);
1588 // If the unscaled offset isn't a multiple of the MemSize, we can't
1589 // pair the operations together: bail and keep looking.
1590 if (MIOffset
% MemSize
) {
1591 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1593 MemInsns
.push_back(&MI
);
1596 MIOffset
/= MemSize
;
1598 MIOffset
*= MemSize
;
1602 bool IsPreLdSt
= isPreLdStPairCandidate(FirstMI
, MI
);
1604 if (BaseReg
== MIBaseReg
) {
1605 // If the offset of the second ld/st is not equal to the size of the
1606 // destination register it can’t be paired with a pre-index ld/st
1607 // pair. Additionally if the base reg is used or modified the operations
1608 // can't be paired: bail and keep looking.
1610 bool IsOutOfBounds
= MIOffset
!= TII
->getMemScale(MI
);
1611 bool IsBaseRegUsed
=
1612 !UsedRegUnits
.available(getLdStBaseOp(MI
).getReg());
1613 bool IsBaseRegModified
=
1614 !ModifiedRegUnits
.available(getLdStBaseOp(MI
).getReg());
1615 // If the stored value and the address of the second instruction is
1616 // the same, it needs to be using the updated register and therefore
1617 // it must not be folded.
1618 bool IsMIRegTheSame
= TRI
->regsOverlap(getLdStRegOp(MI
).getReg(),
1619 getLdStBaseOp(MI
).getReg());
1620 if (IsOutOfBounds
|| IsBaseRegUsed
|| IsBaseRegModified
||
1622 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1624 MemInsns
.push_back(&MI
);
1628 if ((Offset
!= MIOffset
+ OffsetStride
) &&
1629 (Offset
+ OffsetStride
!= MIOffset
)) {
1630 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1632 MemInsns
.push_back(&MI
);
1637 int MinOffset
= Offset
< MIOffset
? Offset
: MIOffset
;
1638 if (FindNarrowMerge
) {
1639 // If the alignment requirements of the scaled wide load/store
1640 // instruction can't express the offset of the scaled narrow input,
1641 // bail and keep looking. For promotable zero stores, allow only when
1642 // the stored value is the same (i.e., WZR).
1643 if ((!IsUnscaled
&& alignTo(MinOffset
, 2) != MinOffset
) ||
1644 (IsPromotableZeroStore
&& Reg
!= getLdStRegOp(MI
).getReg())) {
1645 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1647 MemInsns
.push_back(&MI
);
1651 // Pairwise instructions have a 7-bit signed offset field. Single
1652 // insns have a 12-bit unsigned offset field. If the resultant
1653 // immediate offset of merging these instructions is out of range for
1654 // a pairwise instruction, bail and keep looking.
1655 if (!inBoundsForPair(IsUnscaled
, MinOffset
, OffsetStride
)) {
1656 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1658 MemInsns
.push_back(&MI
);
1661 // If the alignment requirements of the paired (scaled) instruction
1662 // can't express the offset of the unscaled input, bail and keep
1664 if (IsUnscaled
&& (alignTo(MinOffset
, OffsetStride
) != MinOffset
)) {
1665 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1667 MemInsns
.push_back(&MI
);
1671 // If the destination register of one load is the same register or a
1672 // sub/super register of the other load, bail and keep looking. A
1673 // load-pair instruction with both destination registers the same is
1674 // UNPREDICTABLE and will result in an exception.
1676 TRI
->isSuperOrSubRegisterEq(Reg
, getLdStRegOp(MI
).getReg())) {
1677 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
,
1679 MemInsns
.push_back(&MI
);
1683 // If the BaseReg has been modified, then we cannot do the optimization.
1684 // For example, in the following pattern
1688 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1689 if (!ModifiedRegUnits
.available(BaseReg
))
1692 // If the Rt of the second instruction was not modified or used between
1693 // the two instructions and none of the instructions between the second
1694 // and first alias with the second, we can combine the second into the
1696 if (ModifiedRegUnits
.available(getLdStRegOp(MI
).getReg()) &&
1698 !UsedRegUnits
.available(getLdStRegOp(MI
).getReg())) &&
1699 !mayAlias(MI
, MemInsns
, AA
)) {
1701 Flags
.setMergeForward(false);
1702 Flags
.clearRenameReg();
1706 // Likewise, if the Rt of the first instruction is not modified or used
1707 // between the two instructions and none of the instructions between the
1708 // first and the second alias with the first, we can combine the first
1711 !UsedRegUnits
.available(getLdStRegOp(FirstMI
).getReg())) &&
1712 !mayAlias(FirstMI
, MemInsns
, AA
)) {
1714 if (ModifiedRegUnits
.available(getLdStRegOp(FirstMI
).getReg())) {
1715 Flags
.setMergeForward(true);
1716 Flags
.clearRenameReg();
1720 if (DebugCounter::shouldExecute(RegRenamingCounter
)) {
1721 if (!MaybeCanRename
)
1722 MaybeCanRename
= {canRenameUpToDef(FirstMI
, UsedInBetween
,
1723 RequiredClasses
, TRI
)};

            if (*MaybeCanRename) {
              Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
                  *FirstMI.getParent()->getParent(), Reg, DefinedInBB,
                  UsedInBetween, RequiredClasses, TRI);
              if (MaybeRenameReg) {
                Flags.setRenameReg(*MaybeRenameReg);
                Flags.setMergeForward(true);
                MBBIWithRenameReg = MBBI;
              }
            }
          }
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall())
      return E;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(NextI, E);

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
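  // The update amount is in bytes, while the writeback immediate below uses
  // the same scaling as the instruction's offset field. For example, folding
  // "add sp, sp, #16" into "ldp x19, x20, [sp]" yields
  // "ldp x19, x20, [sp], #16", whose encoded immediate is 16 / 8 == 2.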
  if (!isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  (void)MIB;

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;
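    // For example, "add sp, sp, #16" after "ldp x19, x20, [sp]" is a matching
    // update: UpdateOffset is 16, Scale is 8, and the scaled offset of 2 fits
    // in the signed 7-bit writeback immediate of the paired instruction.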

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}

static bool needsWinCFI(const MachineFunction *MF) {
  return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         MF->getFunction().needsUnwindTableEntry();
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset =
      getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions, which
  // ignore the address part of the source register.
  // This does not apply to STGPi either, which does not have unpredictable
  // behavior in this case unlike normal stores, and always performs writeback
  // after reading the source register value.
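  // For example, folding "add x0, x0, #8" into "ldr x0, [x0]" to form
  // "ldr x0, [x0], #8" would combine a load into x0 with a writeback of x0,
  // which the architecture treats as unpredictable.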
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(MBBI, E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
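  // For example, in
  //   str x0, [sp]
  //   str x1, [x2]        ; x2 may point into the 16 bytes above sp
  //   add sp, sp, #16
  // folding the add into the first store would release that stack region
  // before the second store has executed.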
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it would require adjusting the unwind/debug info; getting
    // the unwind info wrong can actually cause a miscompile.
    return E;
  }

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    // If we are optimizing SP, do not allow instructions that may load or
    // store in between the load and the optimized value update.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg) ||
        (BaseRegSP && MBBI->mayLoadOrStore()))
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
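  // What we are looking for here is, for example, turning
  //   sub sp, sp, #16
  //   str x0, [sp]
  // into the single pre-indexed store
  //   str x0, [sp, #-16]!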

  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it would require adjusting the unwind/debug info; getting
    // the unwind info wrong can actually cause a miscompile.
    return E;
  }

  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc &&
          MBBI->getOperand(2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;

    // Keep track if we have a memory access before an SP pre-increment; in
    // this case we need to validate later that the update amount respects the
    // red zone.
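    // For example, in
    //   sub sp, sp, #Amt
    //   str x1, [x2]        ; may store into the area the sub just allocated
    //   str x0, [sp]
    // folding the sub into the last store delays the SP decrement, leaving the
    // intervening store writing below SP, which is only safe if Amt fits in
    // the red zone.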
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}

bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergeable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1.)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;
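  // For example, an 8-byte load at a scaled offset of 65 ("ldr x0, [x1, #520]")
  // stays outside the signed 7-bit pair range even after allowing one extra
  // stride, so there is no point scanning for a partner.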

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done mucking about.
    auto Prev = std::prev(MBBI);
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(Prev); I != MBBI; I++)
      updateDefinedRegisters(*I, DefinedInBB, TRI);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  //   ldr x0, [x20]
  //   add x20, x20, #32
  //   merged into:
  //   ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  //   add x0, x0, #8
  //   ldr x1, [x0]
  //   merged into:
  //   ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for, however, is not,
  // so adjust here.
  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  //   ldr x1, [x0, #64]
  //   add x0, x0, #64
  //   merged into:
  //   ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }

  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
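  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]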
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
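  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4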
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a
  // load or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads
// and stores near one another? Note: The pre-RA instruction scheduler already
// has hooks to try and schedule pairable loads/stores together to improve
// pairing opportunities. Thus, a pre-RA pairing pass may not be worth the
// effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}