1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
12 //===----------------------------------------------------------------------===//
14 #include "AArch64InstrInfo.h"
15 #include "AArch64MachineFunctionInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Analysis/AliasAnalysis.h"
24 #include "llvm/CodeGen/MachineBasicBlock.h"
25 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineInstr.h"
28 #include "llvm/CodeGen/MachineInstrBuilder.h"
29 #include "llvm/CodeGen/MachineOperand.h"
30 #include "llvm/CodeGen/MachineRegisterInfo.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/MC/MCAsmInfo.h"
34 #include "llvm/MC/MCRegisterInfo.h"
35 #include "llvm/Pass.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Support/Debug.h"
38 #include "llvm/Support/DebugCounter.h"
39 #include "llvm/Support/ErrorHandling.h"
40 #include "llvm/Support/raw_ostream.h"
49 #define DEBUG_TYPE "aarch64-ldst-opt"
51 STATISTIC(NumPairCreated
, "Number of load/store pair instructions generated");
52 STATISTIC(NumPostFolded
, "Number of post-index updates folded");
53 STATISTIC(NumPreFolded
, "Number of pre-index updates folded");
54 STATISTIC(NumUnscaledPairCreated
,
55 "Number of load/store from unscaled generated");
56 STATISTIC(NumZeroStoresPromoted
, "Number of narrow zero stores promoted");
57 STATISTIC(NumLoadsFromStoresPromoted
, "Number of loads from stores promoted");
59 DEBUG_COUNTER(RegRenamingCounter
, DEBUG_TYPE
"-reg-renaming",
60 "Controls which pairs are considered for renaming");
62 // The LdStLimit limits how far we search for load/store pairs.
63 static cl::opt
<unsigned> LdStLimit("aarch64-load-store-scan-limit",
64 cl::init(20), cl::Hidden
);
66 // The UpdateLimit limits how far we search for update instructions when we form
67 // pre-/post-index instructions.
68 static cl::opt
<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
71 // Enable register renaming to find additional store pairing opportunities.
72 static cl::opt
<bool> EnableRenaming("aarch64-load-store-renaming",
73 cl::init(true), cl::Hidden
);
75 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
79 using LdStPairFlags
= struct LdStPairFlags
{
80 // If a matching instruction is found, MergeForward is set to true if the
81 // merge is to remove the first instruction and replace the second with
82 // a pair-wise insn, and false if the reverse is true.
83 bool MergeForward
= false;
85 // SExtIdx gives the index of the result of the load pair that must be
86 // extended. The value of SExtIdx assumes that the paired load produces the
87 // value in this order: (I, returned iterator), i.e., -1 means no value has
88 // to be extended, 0 means I, and 1 means the returned iterator.
91 // If not none, RenameReg can be used to rename the result register of the
92 // first store in a pair. Currently this only works when merging stores
94 Optional
<MCPhysReg
> RenameReg
= None
;
96 LdStPairFlags() = default;
98 void setMergeForward(bool V
= true) { MergeForward
= V
; }
99 bool getMergeForward() const { return MergeForward
; }
101 void setSExtIdx(int V
) { SExtIdx
= V
; }
102 int getSExtIdx() const { return SExtIdx
; }
104 void setRenameReg(MCPhysReg R
) { RenameReg
= R
; }
105 void clearRenameReg() { RenameReg
= None
; }
106 Optional
<MCPhysReg
> getRenameReg() const { return RenameReg
; }
109 struct AArch64LoadStoreOpt
: public MachineFunctionPass
{
112 AArch64LoadStoreOpt() : MachineFunctionPass(ID
) {
113 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
117 const AArch64InstrInfo
*TII
;
118 const TargetRegisterInfo
*TRI
;
119 const AArch64Subtarget
*Subtarget
;
121 // Track which register units have been modified and used.
122 LiveRegUnits ModifiedRegUnits
, UsedRegUnits
;
123 LiveRegUnits DefinedInBB
;
125 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
126 AU
.addRequired
<AAResultsWrapperPass
>();
127 MachineFunctionPass::getAnalysisUsage(AU
);
130 // Scan the instructions looking for a load/store that can be combined
131 // with the current instruction into a load/store pair.
132 // Return the matching instruction if one is found, else MBB->end().
133 MachineBasicBlock::iterator
findMatchingInsn(MachineBasicBlock::iterator I
,
134 LdStPairFlags
&Flags
,
136 bool FindNarrowMerge
);
138 // Scan the instructions looking for a store that writes to the address from
139 // which the current load instruction reads. Return true if one is found.
140 bool findMatchingStore(MachineBasicBlock::iterator I
, unsigned Limit
,
141 MachineBasicBlock::iterator
&StoreI
);
143 // Merge the two instructions indicated into a wider narrow store instruction.
144 MachineBasicBlock::iterator
145 mergeNarrowZeroStores(MachineBasicBlock::iterator I
,
146 MachineBasicBlock::iterator MergeMI
,
147 const LdStPairFlags
&Flags
);
149 // Merge the two instructions indicated into a single pair-wise instruction.
150 MachineBasicBlock::iterator
151 mergePairedInsns(MachineBasicBlock::iterator I
,
152 MachineBasicBlock::iterator Paired
,
153 const LdStPairFlags
&Flags
);
155 // Promote the load that reads directly from the address stored to.
156 MachineBasicBlock::iterator
157 promoteLoadFromStore(MachineBasicBlock::iterator LoadI
,
158 MachineBasicBlock::iterator StoreI
);
160 // Scan the instruction list to find a base register update that can
161 // be combined with the current instruction (a load or store) using
162 // pre or post indexed addressing with writeback. Scan forwards.
163 MachineBasicBlock::iterator
164 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I
,
165 int UnscaledOffset
, unsigned Limit
);
167 // Scan the instruction list to find a base register update that can
168 // be combined with the current instruction (a load or store) using
169 // pre or post indexed addressing with writeback. Scan backwards.
170 MachineBasicBlock::iterator
171 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I
, unsigned Limit
);
173 // Find an instruction that updates the base register of the ld/st
175 bool isMatchingUpdateInsn(MachineInstr
&MemMI
, MachineInstr
&MI
,
176 unsigned BaseReg
, int Offset
);
178 // Merge a pre- or post-index base register update into a ld/st instruction.
179 MachineBasicBlock::iterator
180 mergeUpdateInsn(MachineBasicBlock::iterator I
,
181 MachineBasicBlock::iterator Update
, bool IsPreIdx
);
183 // Find and merge zero store instructions.
184 bool tryToMergeZeroStInst(MachineBasicBlock::iterator
&MBBI
);
186 // Find and pair ldr/str instructions.
187 bool tryToPairLdStInst(MachineBasicBlock::iterator
&MBBI
);
189 // Find and promote load instructions which read directly from store.
190 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator
&MBBI
);
192 // Find and merge a base register updates before or after a ld/st instruction.
193 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator
&MBBI
);
195 bool optimizeBlock(MachineBasicBlock
&MBB
, bool EnableNarrowZeroStOpt
);
197 bool runOnMachineFunction(MachineFunction
&Fn
) override
;
199 MachineFunctionProperties
getRequiredProperties() const override
{
200 return MachineFunctionProperties().set(
201 MachineFunctionProperties::Property::NoVRegs
);
204 StringRef
getPassName() const override
{ return AARCH64_LOAD_STORE_OPT_NAME
; }
207 char AArch64LoadStoreOpt::ID
= 0;
209 } // end anonymous namespace
211 INITIALIZE_PASS(AArch64LoadStoreOpt
, "aarch64-ldst-opt",
212 AARCH64_LOAD_STORE_OPT_NAME
, false, false)
214 static bool isNarrowStore(unsigned Opc
) {
218 case AArch64::STRBBui
:
219 case AArch64::STURBBi
:
220 case AArch64::STRHHui
:
221 case AArch64::STURHHi
:
226 // These instruction set memory tag and either keep memory contents unchanged or
227 // set it to zero, ignoring the address part of the source register.
228 static bool isTagStore(const MachineInstr
&MI
) {
229 switch (MI
.getOpcode()) {
232 case AArch64::STGOffset
:
233 case AArch64::STZGOffset
:
234 case AArch64::ST2GOffset
:
235 case AArch64::STZ2GOffset
:
240 static unsigned getMatchingNonSExtOpcode(unsigned Opc
,
241 bool *IsValidLdStrOpc
= nullptr) {
243 *IsValidLdStrOpc
= true;
247 *IsValidLdStrOpc
= false;
248 return std::numeric_limits
<unsigned>::max();
249 case AArch64::STRDui
:
250 case AArch64::STURDi
:
251 case AArch64::STRDpre
:
252 case AArch64::STRQui
:
253 case AArch64::STURQi
:
254 case AArch64::STRQpre
:
255 case AArch64::STRBBui
:
256 case AArch64::STURBBi
:
257 case AArch64::STRHHui
:
258 case AArch64::STURHHi
:
259 case AArch64::STRWui
:
260 case AArch64::STRWpre
:
261 case AArch64::STURWi
:
262 case AArch64::STRXui
:
263 case AArch64::STRXpre
:
264 case AArch64::STURXi
:
265 case AArch64::LDRDui
:
266 case AArch64::LDURDi
:
267 case AArch64::LDRDpre
:
268 case AArch64::LDRQui
:
269 case AArch64::LDURQi
:
270 case AArch64::LDRQpre
:
271 case AArch64::LDRWui
:
272 case AArch64::LDURWi
:
273 case AArch64::LDRWpre
:
274 case AArch64::LDRXui
:
275 case AArch64::LDURXi
:
276 case AArch64::LDRXpre
:
277 case AArch64::STRSui
:
278 case AArch64::STURSi
:
279 case AArch64::STRSpre
:
280 case AArch64::LDRSui
:
281 case AArch64::LDURSi
:
282 case AArch64::LDRSpre
:
284 case AArch64::LDRSWui
:
285 return AArch64::LDRWui
;
286 case AArch64::LDURSWi
:
287 return AArch64::LDURWi
;
291 static unsigned getMatchingWideOpcode(unsigned Opc
) {
294 llvm_unreachable("Opcode has no wide equivalent!");
295 case AArch64::STRBBui
:
296 return AArch64::STRHHui
;
297 case AArch64::STRHHui
:
298 return AArch64::STRWui
;
299 case AArch64::STURBBi
:
300 return AArch64::STURHHi
;
301 case AArch64::STURHHi
:
302 return AArch64::STURWi
;
303 case AArch64::STURWi
:
304 return AArch64::STURXi
;
305 case AArch64::STRWui
:
306 return AArch64::STRXui
;
310 static unsigned getMatchingPairOpcode(unsigned Opc
) {
313 llvm_unreachable("Opcode has no pairwise equivalent!");
314 case AArch64::STRSui
:
315 case AArch64::STURSi
:
316 return AArch64::STPSi
;
317 case AArch64::STRSpre
:
318 return AArch64::STPSpre
;
319 case AArch64::STRDui
:
320 case AArch64::STURDi
:
321 return AArch64::STPDi
;
322 case AArch64::STRDpre
:
323 return AArch64::STPDpre
;
324 case AArch64::STRQui
:
325 case AArch64::STURQi
:
326 return AArch64::STPQi
;
327 case AArch64::STRQpre
:
328 return AArch64::STPQpre
;
329 case AArch64::STRWui
:
330 case AArch64::STURWi
:
331 return AArch64::STPWi
;
332 case AArch64::STRWpre
:
333 return AArch64::STPWpre
;
334 case AArch64::STRXui
:
335 case AArch64::STURXi
:
336 return AArch64::STPXi
;
337 case AArch64::STRXpre
:
338 return AArch64::STPXpre
;
339 case AArch64::LDRSui
:
340 case AArch64::LDURSi
:
341 return AArch64::LDPSi
;
342 case AArch64::LDRSpre
:
343 return AArch64::LDPSpre
;
344 case AArch64::LDRDui
:
345 case AArch64::LDURDi
:
346 return AArch64::LDPDi
;
347 case AArch64::LDRDpre
:
348 return AArch64::LDPDpre
;
349 case AArch64::LDRQui
:
350 case AArch64::LDURQi
:
351 return AArch64::LDPQi
;
352 case AArch64::LDRQpre
:
353 return AArch64::LDPQpre
;
354 case AArch64::LDRWui
:
355 case AArch64::LDURWi
:
356 return AArch64::LDPWi
;
357 case AArch64::LDRWpre
:
358 return AArch64::LDPWpre
;
359 case AArch64::LDRXui
:
360 case AArch64::LDURXi
:
361 return AArch64::LDPXi
;
362 case AArch64::LDRXpre
:
363 return AArch64::LDPXpre
;
364 case AArch64::LDRSWui
:
365 case AArch64::LDURSWi
:
366 return AArch64::LDPSWi
;
370 static unsigned isMatchingStore(MachineInstr
&LoadInst
,
371 MachineInstr
&StoreInst
) {
372 unsigned LdOpc
= LoadInst
.getOpcode();
373 unsigned StOpc
= StoreInst
.getOpcode();
376 llvm_unreachable("Unsupported load instruction!");
377 case AArch64::LDRBBui
:
378 return StOpc
== AArch64::STRBBui
|| StOpc
== AArch64::STRHHui
||
379 StOpc
== AArch64::STRWui
|| StOpc
== AArch64::STRXui
;
380 case AArch64::LDURBBi
:
381 return StOpc
== AArch64::STURBBi
|| StOpc
== AArch64::STURHHi
||
382 StOpc
== AArch64::STURWi
|| StOpc
== AArch64::STURXi
;
383 case AArch64::LDRHHui
:
384 return StOpc
== AArch64::STRHHui
|| StOpc
== AArch64::STRWui
||
385 StOpc
== AArch64::STRXui
;
386 case AArch64::LDURHHi
:
387 return StOpc
== AArch64::STURHHi
|| StOpc
== AArch64::STURWi
||
388 StOpc
== AArch64::STURXi
;
389 case AArch64::LDRWui
:
390 return StOpc
== AArch64::STRWui
|| StOpc
== AArch64::STRXui
;
391 case AArch64::LDURWi
:
392 return StOpc
== AArch64::STURWi
|| StOpc
== AArch64::STURXi
;
393 case AArch64::LDRXui
:
394 return StOpc
== AArch64::STRXui
;
395 case AArch64::LDURXi
:
396 return StOpc
== AArch64::STURXi
;
400 static unsigned getPreIndexedOpcode(unsigned Opc
) {
401 // FIXME: We don't currently support creating pre-indexed loads/stores when
402 // the load or store is the unscaled version. If we decide to perform such an
403 // optimization in the future the cases for the unscaled loads/stores will
404 // need to be added here.
407 llvm_unreachable("Opcode has no pre-indexed equivalent!");
408 case AArch64::STRSui
:
409 return AArch64::STRSpre
;
410 case AArch64::STRDui
:
411 return AArch64::STRDpre
;
412 case AArch64::STRQui
:
413 return AArch64::STRQpre
;
414 case AArch64::STRBBui
:
415 return AArch64::STRBBpre
;
416 case AArch64::STRHHui
:
417 return AArch64::STRHHpre
;
418 case AArch64::STRWui
:
419 return AArch64::STRWpre
;
420 case AArch64::STRXui
:
421 return AArch64::STRXpre
;
422 case AArch64::LDRSui
:
423 return AArch64::LDRSpre
;
424 case AArch64::LDRDui
:
425 return AArch64::LDRDpre
;
426 case AArch64::LDRQui
:
427 return AArch64::LDRQpre
;
428 case AArch64::LDRBBui
:
429 return AArch64::LDRBBpre
;
430 case AArch64::LDRHHui
:
431 return AArch64::LDRHHpre
;
432 case AArch64::LDRWui
:
433 return AArch64::LDRWpre
;
434 case AArch64::LDRXui
:
435 return AArch64::LDRXpre
;
436 case AArch64::LDRSWui
:
437 return AArch64::LDRSWpre
;
439 return AArch64::LDPSpre
;
440 case AArch64::LDPSWi
:
441 return AArch64::LDPSWpre
;
443 return AArch64::LDPDpre
;
445 return AArch64::LDPQpre
;
447 return AArch64::LDPWpre
;
449 return AArch64::LDPXpre
;
451 return AArch64::STPSpre
;
453 return AArch64::STPDpre
;
455 return AArch64::STPQpre
;
457 return AArch64::STPWpre
;
459 return AArch64::STPXpre
;
460 case AArch64::STGOffset
:
461 return AArch64::STGPreIndex
;
462 case AArch64::STZGOffset
:
463 return AArch64::STZGPreIndex
;
464 case AArch64::ST2GOffset
:
465 return AArch64::ST2GPreIndex
;
466 case AArch64::STZ2GOffset
:
467 return AArch64::STZ2GPreIndex
;
469 return AArch64::STGPpre
;
473 static unsigned getPostIndexedOpcode(unsigned Opc
) {
476 llvm_unreachable("Opcode has no post-indexed wise equivalent!");
477 case AArch64::STRSui
:
478 case AArch64::STURSi
:
479 return AArch64::STRSpost
;
480 case AArch64::STRDui
:
481 case AArch64::STURDi
:
482 return AArch64::STRDpost
;
483 case AArch64::STRQui
:
484 case AArch64::STURQi
:
485 return AArch64::STRQpost
;
486 case AArch64::STRBBui
:
487 return AArch64::STRBBpost
;
488 case AArch64::STRHHui
:
489 return AArch64::STRHHpost
;
490 case AArch64::STRWui
:
491 case AArch64::STURWi
:
492 return AArch64::STRWpost
;
493 case AArch64::STRXui
:
494 case AArch64::STURXi
:
495 return AArch64::STRXpost
;
496 case AArch64::LDRSui
:
497 case AArch64::LDURSi
:
498 return AArch64::LDRSpost
;
499 case AArch64::LDRDui
:
500 case AArch64::LDURDi
:
501 return AArch64::LDRDpost
;
502 case AArch64::LDRQui
:
503 case AArch64::LDURQi
:
504 return AArch64::LDRQpost
;
505 case AArch64::LDRBBui
:
506 return AArch64::LDRBBpost
;
507 case AArch64::LDRHHui
:
508 return AArch64::LDRHHpost
;
509 case AArch64::LDRWui
:
510 case AArch64::LDURWi
:
511 return AArch64::LDRWpost
;
512 case AArch64::LDRXui
:
513 case AArch64::LDURXi
:
514 return AArch64::LDRXpost
;
515 case AArch64::LDRSWui
:
516 return AArch64::LDRSWpost
;
518 return AArch64::LDPSpost
;
519 case AArch64::LDPSWi
:
520 return AArch64::LDPSWpost
;
522 return AArch64::LDPDpost
;
524 return AArch64::LDPQpost
;
526 return AArch64::LDPWpost
;
528 return AArch64::LDPXpost
;
530 return AArch64::STPSpost
;
532 return AArch64::STPDpost
;
534 return AArch64::STPQpost
;
536 return AArch64::STPWpost
;
538 return AArch64::STPXpost
;
539 case AArch64::STGOffset
:
540 return AArch64::STGPostIndex
;
541 case AArch64::STZGOffset
:
542 return AArch64::STZGPostIndex
;
543 case AArch64::ST2GOffset
:
544 return AArch64::ST2GPostIndex
;
545 case AArch64::STZ2GOffset
:
546 return AArch64::STZ2GPostIndex
;
548 return AArch64::STGPpost
;
552 static bool isPairedLdSt(const MachineInstr
&MI
) {
553 switch (MI
.getOpcode()) {
557 case AArch64::LDPSWi
:
572 static bool isPreLdStPairCandidate(MachineInstr
&FirstMI
, MachineInstr
&MI
) {
574 unsigned OpcA
= FirstMI
.getOpcode();
575 unsigned OpcB
= MI
.getOpcode();
580 case AArch64::STRSpre
:
581 return (OpcB
== AArch64::STRSui
) || (OpcB
== AArch64::STURSi
);
582 case AArch64::STRDpre
:
583 return (OpcB
== AArch64::STRDui
) || (OpcB
== AArch64::STURDi
);
584 case AArch64::STRQpre
:
585 return (OpcB
== AArch64::STRQui
) || (OpcB
== AArch64::STURQi
);
586 case AArch64::STRWpre
:
587 return (OpcB
== AArch64::STRWui
) || (OpcB
== AArch64::STURWi
);
588 case AArch64::STRXpre
:
589 return (OpcB
== AArch64::STRXui
) || (OpcB
== AArch64::STURXi
);
590 case AArch64::LDRSpre
:
591 return (OpcB
== AArch64::LDRSui
) || (OpcB
== AArch64::LDURSi
);
592 case AArch64::LDRDpre
:
593 return (OpcB
== AArch64::LDRDui
) || (OpcB
== AArch64::LDURDi
);
594 case AArch64::LDRQpre
:
595 return (OpcB
== AArch64::LDRQui
) || (OpcB
== AArch64::LDURQi
);
596 case AArch64::LDRWpre
:
597 return (OpcB
== AArch64::LDRWui
) || (OpcB
== AArch64::LDURWi
);
598 case AArch64::LDRXpre
:
599 return (OpcB
== AArch64::LDRXui
) || (OpcB
== AArch64::LDURXi
);
603 // Returns the scale and offset range of pre/post indexed variants of MI.
604 static void getPrePostIndexedMemOpInfo(const MachineInstr
&MI
, int &Scale
,
605 int &MinOffset
, int &MaxOffset
) {
606 bool IsPaired
= isPairedLdSt(MI
);
607 bool IsTagStore
= isTagStore(MI
);
608 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
609 // as in the "unsigned offset" variant.
610 // All other pre/post indexed ldst instructions are unscaled.
611 Scale
= (IsTagStore
|| IsPaired
) ? AArch64InstrInfo::getMemScale(MI
) : 1;
622 static MachineOperand
&getLdStRegOp(MachineInstr
&MI
,
623 unsigned PairedRegOp
= 0) {
624 assert(PairedRegOp
< 2 && "Unexpected register operand idx.");
625 bool IsPreLdSt
= AArch64InstrInfo::isPreLdSt(MI
);
628 unsigned Idx
= isPairedLdSt(MI
) || IsPreLdSt
? PairedRegOp
: 0;
629 return MI
.getOperand(Idx
);
632 static const MachineOperand
&getLdStBaseOp(const MachineInstr
&MI
) {
633 unsigned Idx
= isPairedLdSt(MI
) || AArch64InstrInfo::isPreLdSt(MI
) ? 2 : 1;
634 return MI
.getOperand(Idx
);
637 static const MachineOperand
&getLdStOffsetOp(const MachineInstr
&MI
) {
638 unsigned Idx
= isPairedLdSt(MI
) || AArch64InstrInfo::isPreLdSt(MI
) ? 3 : 2;
639 return MI
.getOperand(Idx
);
642 static bool isLdOffsetInRangeOfSt(MachineInstr
&LoadInst
,
643 MachineInstr
&StoreInst
,
644 const AArch64InstrInfo
*TII
) {
645 assert(isMatchingStore(LoadInst
, StoreInst
) && "Expect only matched ld/st.");
646 int LoadSize
= TII
->getMemScale(LoadInst
);
647 int StoreSize
= TII
->getMemScale(StoreInst
);
648 int UnscaledStOffset
= TII
->hasUnscaledLdStOffset(StoreInst
)
649 ? getLdStOffsetOp(StoreInst
).getImm()
650 : getLdStOffsetOp(StoreInst
).getImm() * StoreSize
;
651 int UnscaledLdOffset
= TII
->hasUnscaledLdStOffset(LoadInst
)
652 ? getLdStOffsetOp(LoadInst
).getImm()
653 : getLdStOffsetOp(LoadInst
).getImm() * LoadSize
;
654 return (UnscaledStOffset
<= UnscaledLdOffset
) &&
655 (UnscaledLdOffset
+ LoadSize
<= (UnscaledStOffset
+ StoreSize
));
658 static bool isPromotableZeroStoreInst(MachineInstr
&MI
) {
659 unsigned Opc
= MI
.getOpcode();
660 return (Opc
== AArch64::STRWui
|| Opc
== AArch64::STURWi
||
661 isNarrowStore(Opc
)) &&
662 getLdStRegOp(MI
).getReg() == AArch64::WZR
;
665 static bool isPromotableLoadFromStore(MachineInstr
&MI
) {
666 switch (MI
.getOpcode()) {
669 // Scaled instructions.
670 case AArch64::LDRBBui
:
671 case AArch64::LDRHHui
:
672 case AArch64::LDRWui
:
673 case AArch64::LDRXui
:
674 // Unscaled instructions.
675 case AArch64::LDURBBi
:
676 case AArch64::LDURHHi
:
677 case AArch64::LDURWi
:
678 case AArch64::LDURXi
:
683 static bool isMergeableLdStUpdate(MachineInstr
&MI
) {
684 unsigned Opc
= MI
.getOpcode();
688 // Scaled instructions.
689 case AArch64::STRSui
:
690 case AArch64::STRDui
:
691 case AArch64::STRQui
:
692 case AArch64::STRXui
:
693 case AArch64::STRWui
:
694 case AArch64::STRHHui
:
695 case AArch64::STRBBui
:
696 case AArch64::LDRSui
:
697 case AArch64::LDRDui
:
698 case AArch64::LDRQui
:
699 case AArch64::LDRXui
:
700 case AArch64::LDRWui
:
701 case AArch64::LDRHHui
:
702 case AArch64::LDRBBui
:
703 case AArch64::STGOffset
:
704 case AArch64::STZGOffset
:
705 case AArch64::ST2GOffset
:
706 case AArch64::STZ2GOffset
:
708 // Unscaled instructions.
709 case AArch64::STURSi
:
710 case AArch64::STURDi
:
711 case AArch64::STURQi
:
712 case AArch64::STURWi
:
713 case AArch64::STURXi
:
714 case AArch64::LDURSi
:
715 case AArch64::LDURDi
:
716 case AArch64::LDURQi
:
717 case AArch64::LDURWi
:
718 case AArch64::LDURXi
:
719 // Paired instructions.
721 case AArch64::LDPSWi
:
731 // Make sure this is a reg+imm (as opposed to an address reloc).
732 if (!getLdStOffsetOp(MI
).isImm())
739 MachineBasicBlock::iterator
740 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I
,
741 MachineBasicBlock::iterator MergeMI
,
742 const LdStPairFlags
&Flags
) {
743 assert(isPromotableZeroStoreInst(*I
) && isPromotableZeroStoreInst(*MergeMI
) &&
744 "Expected promotable zero stores.");
746 MachineBasicBlock::iterator E
= I
->getParent()->end();
747 MachineBasicBlock::iterator NextI
= next_nodbg(I
, E
);
748 // If NextI is the second of the two instructions to be merged, we need
749 // to skip one further. Either way we merge will invalidate the iterator,
750 // and we don't need to scan the new instruction, as it's a pairwise
751 // instruction, which we're not considering for further action anyway.
752 if (NextI
== MergeMI
)
753 NextI
= next_nodbg(NextI
, E
);
755 unsigned Opc
= I
->getOpcode();
756 bool IsScaled
= !TII
->hasUnscaledLdStOffset(Opc
);
757 int OffsetStride
= IsScaled
? 1 : TII
->getMemScale(*I
);
759 bool MergeForward
= Flags
.getMergeForward();
760 // Insert our new paired instruction after whichever of the paired
761 // instructions MergeForward indicates.
762 MachineBasicBlock::iterator InsertionPoint
= MergeForward
? MergeMI
: I
;
763 // Also based on MergeForward is from where we copy the base register operand
764 // so we get the flags compatible with the input code.
765 const MachineOperand
&BaseRegOp
=
766 MergeForward
? getLdStBaseOp(*MergeMI
) : getLdStBaseOp(*I
);
768 // Which register is Rt and which is Rt2 depends on the offset order.
770 if (getLdStOffsetOp(*I
).getImm() ==
771 getLdStOffsetOp(*MergeMI
).getImm() + OffsetStride
)
776 int OffsetImm
= getLdStOffsetOp(*RtMI
).getImm();
777 // Change the scaled offset from small to large type.
779 assert(((OffsetImm
& 1) == 0) && "Unexpected offset to merge");
783 // Construct the new instruction.
784 DebugLoc DL
= I
->getDebugLoc();
785 MachineBasicBlock
*MBB
= I
->getParent();
786 MachineInstrBuilder MIB
;
787 MIB
= BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(getMatchingWideOpcode(Opc
)))
788 .addReg(isNarrowStore(Opc
) ? AArch64::WZR
: AArch64::XZR
)
791 .cloneMergedMemRefs({&*I
, &*MergeMI
})
792 .setMIFlags(I
->mergeFlagsWith(*MergeMI
));
795 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
796 LLVM_DEBUG(I
->print(dbgs()));
797 LLVM_DEBUG(dbgs() << " ");
798 LLVM_DEBUG(MergeMI
->print(dbgs()));
799 LLVM_DEBUG(dbgs() << " with instruction:\n ");
800 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
801 LLVM_DEBUG(dbgs() << "\n");
803 // Erase the old instructions.
804 I
->eraseFromParent();
805 MergeMI
->eraseFromParent();
809 // Apply Fn to all instructions between MI and the beginning of the block, until
810 // a def for DefReg is reached. Returns true, iff Fn returns true for all
811 // visited instructions. Stop after visiting Limit iterations.
812 static bool forAllMIsUntilDef(MachineInstr
&MI
, MCPhysReg DefReg
,
813 const TargetRegisterInfo
*TRI
, unsigned Limit
,
814 std::function
<bool(MachineInstr
&, bool)> &Fn
) {
815 auto MBB
= MI
.getParent();
816 for (MachineInstr
&I
:
817 instructionsWithoutDebug(MI
.getReverseIterator(), MBB
->instr_rend())) {
822 bool isDef
= any_of(I
.operands(), [DefReg
, TRI
](MachineOperand
&MOP
) {
823 return MOP
.isReg() && MOP
.isDef() && !MOP
.isDebug() && MOP
.getReg() &&
824 TRI
->regsOverlap(MOP
.getReg(), DefReg
);
834 static void updateDefinedRegisters(MachineInstr
&MI
, LiveRegUnits
&Units
,
835 const TargetRegisterInfo
*TRI
) {
837 for (const MachineOperand
&MOP
: phys_regs_and_masks(MI
))
838 if (MOP
.isReg() && MOP
.isKill())
839 Units
.removeReg(MOP
.getReg());
841 for (const MachineOperand
&MOP
: phys_regs_and_masks(MI
))
842 if (MOP
.isReg() && !MOP
.isKill())
843 Units
.addReg(MOP
.getReg());
846 MachineBasicBlock::iterator
847 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I
,
848 MachineBasicBlock::iterator Paired
,
849 const LdStPairFlags
&Flags
) {
850 MachineBasicBlock::iterator E
= I
->getParent()->end();
851 MachineBasicBlock::iterator NextI
= next_nodbg(I
, E
);
852 // If NextI is the second of the two instructions to be merged, we need
853 // to skip one further. Either way we merge will invalidate the iterator,
854 // and we don't need to scan the new instruction, as it's a pairwise
855 // instruction, which we're not considering for further action anyway.
857 NextI
= next_nodbg(NextI
, E
);
859 int SExtIdx
= Flags
.getSExtIdx();
861 SExtIdx
== -1 ? I
->getOpcode() : getMatchingNonSExtOpcode(I
->getOpcode());
862 bool IsUnscaled
= TII
->hasUnscaledLdStOffset(Opc
);
863 int OffsetStride
= IsUnscaled
? TII
->getMemScale(*I
) : 1;
865 bool MergeForward
= Flags
.getMergeForward();
867 Optional
<MCPhysReg
> RenameReg
= Flags
.getRenameReg();
868 if (MergeForward
&& RenameReg
) {
869 MCRegister RegToRename
= getLdStRegOp(*I
).getReg();
870 DefinedInBB
.addReg(*RenameReg
);
872 // Return the sub/super register for RenameReg, matching the size of
874 auto GetMatchingSubReg
= [this,
875 RenameReg
](MCPhysReg OriginalReg
) -> MCPhysReg
{
876 for (MCPhysReg SubOrSuper
: TRI
->sub_and_superregs_inclusive(*RenameReg
))
877 if (TRI
->getMinimalPhysRegClass(OriginalReg
) ==
878 TRI
->getMinimalPhysRegClass(SubOrSuper
))
880 llvm_unreachable("Should have found matching sub or super register!");
883 std::function
<bool(MachineInstr
&, bool)> UpdateMIs
=
884 [this, RegToRename
, GetMatchingSubReg
](MachineInstr
&MI
, bool IsDef
) {
886 bool SeenDef
= false;
887 for (auto &MOP
: MI
.operands()) {
888 // Rename the first explicit definition and all implicit
889 // definitions matching RegToRename.
890 if (MOP
.isReg() && !MOP
.isDebug() && MOP
.getReg() &&
891 (!SeenDef
|| (MOP
.isDef() && MOP
.isImplicit())) &&
892 TRI
->regsOverlap(MOP
.getReg(), RegToRename
)) {
893 assert((MOP
.isImplicit() ||
894 (MOP
.isRenamable() && !MOP
.isEarlyClobber())) &&
895 "Need renamable operands");
896 MOP
.setReg(GetMatchingSubReg(MOP
.getReg()));
901 for (auto &MOP
: MI
.operands()) {
902 if (MOP
.isReg() && !MOP
.isDebug() && MOP
.getReg() &&
903 TRI
->regsOverlap(MOP
.getReg(), RegToRename
)) {
904 assert((MOP
.isImplicit() ||
905 (MOP
.isRenamable() && !MOP
.isEarlyClobber())) &&
906 "Need renamable operands");
907 MOP
.setReg(GetMatchingSubReg(MOP
.getReg()));
911 LLVM_DEBUG(dbgs() << "Renamed " << MI
<< "\n");
914 forAllMIsUntilDef(*I
, RegToRename
, TRI
, LdStLimit
, UpdateMIs
);
917 // Make sure the register used for renaming is not used between the paired
918 // instructions. That would trash the content before the new paired
921 iterator_range
<MachineInstrBundleIterator
<llvm::MachineInstr
>>(
922 std::next(I
), std::next(Paired
)))
923 assert(all_of(MI
.operands(),
924 [this, &RenameReg
](const MachineOperand
&MOP
) {
925 return !MOP
.isReg() || MOP
.isDebug() || !MOP
.getReg() ||
926 !TRI
->regsOverlap(MOP
.getReg(), *RenameReg
);
928 "Rename register used between paired instruction, trashing the "
933 // Insert our new paired instruction after whichever of the paired
934 // instructions MergeForward indicates.
935 MachineBasicBlock::iterator InsertionPoint
= MergeForward
? Paired
: I
;
936 // Also based on MergeForward is from where we copy the base register operand
937 // so we get the flags compatible with the input code.
938 const MachineOperand
&BaseRegOp
=
939 MergeForward
? getLdStBaseOp(*Paired
) : getLdStBaseOp(*I
);
941 int Offset
= getLdStOffsetOp(*I
).getImm();
942 int PairedOffset
= getLdStOffsetOp(*Paired
).getImm();
943 bool PairedIsUnscaled
= TII
->hasUnscaledLdStOffset(Paired
->getOpcode());
944 if (IsUnscaled
!= PairedIsUnscaled
) {
945 // We're trying to pair instructions that differ in how they are scaled. If
946 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
947 // the opposite (i.e., make Paired's offset unscaled).
948 int MemSize
= TII
->getMemScale(*Paired
);
949 if (PairedIsUnscaled
) {
950 // If the unscaled offset isn't a multiple of the MemSize, we can't
951 // pair the operations together.
952 assert(!(PairedOffset
% TII
->getMemScale(*Paired
)) &&
953 "Offset should be a multiple of the stride!");
954 PairedOffset
/= MemSize
;
956 PairedOffset
*= MemSize
;
960 // Which register is Rt and which is Rt2 depends on the offset order.
961 // However, for pre load/stores the Rt should be the one of the pre
963 MachineInstr
*RtMI
, *Rt2MI
;
964 if (Offset
== PairedOffset
+ OffsetStride
&&
965 !AArch64InstrInfo::isPreLdSt(*I
)) {
968 // Here we swapped the assumption made for SExtIdx.
969 // I.e., we turn ldp I, Paired into ldp Paired, I.
970 // Update the index accordingly.
972 SExtIdx
= (SExtIdx
+ 1) % 2;
977 int OffsetImm
= getLdStOffsetOp(*RtMI
).getImm();
978 // Scale the immediate offset, if necessary.
979 if (TII
->hasUnscaledLdStOffset(RtMI
->getOpcode())) {
980 assert(!(OffsetImm
% TII
->getMemScale(*RtMI
)) &&
981 "Unscaled offset cannot be scaled.");
982 OffsetImm
/= TII
->getMemScale(*RtMI
);
985 // Construct the new instruction.
986 MachineInstrBuilder MIB
;
987 DebugLoc DL
= I
->getDebugLoc();
988 MachineBasicBlock
*MBB
= I
->getParent();
989 MachineOperand RegOp0
= getLdStRegOp(*RtMI
);
990 MachineOperand RegOp1
= getLdStRegOp(*Rt2MI
);
991 // Kill flags may become invalid when moving stores for pairing.
992 if (RegOp0
.isUse()) {
994 // Clear kill flags on store if moving upwards. Example:
997 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
998 RegOp0
.setIsKill(false);
999 RegOp1
.setIsKill(false);
1001 // Clear kill flags of the first stores register. Example:
1003 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1005 Register Reg
= getLdStRegOp(*I
).getReg();
1006 for (MachineInstr
&MI
: make_range(std::next(I
), Paired
))
1007 MI
.clearRegisterKills(Reg
, TRI
);
1011 unsigned int MatchPairOpcode
= getMatchingPairOpcode(Opc
);
1012 MIB
= BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(MatchPairOpcode
));
1014 // Adds the pre-index operand for pre-indexed ld/st pairs.
1015 if (AArch64InstrInfo::isPreLdSt(*RtMI
))
1016 MIB
.addReg(BaseRegOp
.getReg(), RegState::Define
);
1022 .cloneMergedMemRefs({&*I
, &*Paired
})
1023 .setMIFlags(I
->mergeFlagsWith(*Paired
));
1028 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1029 LLVM_DEBUG(I
->print(dbgs()));
1030 LLVM_DEBUG(dbgs() << " ");
1031 LLVM_DEBUG(Paired
->print(dbgs()));
1032 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1033 if (SExtIdx
!= -1) {
1034 // Generate the sign extension for the proper result of the ldp.
1035 // I.e., with X1, that would be:
1036 // %w1 = KILL %w1, implicit-def %x1
1037 // %x1 = SBFMXri killed %x1, 0, 31
1038 MachineOperand
&DstMO
= MIB
->getOperand(SExtIdx
);
1039 // Right now, DstMO has the extended register, since it comes from an
1041 Register DstRegX
= DstMO
.getReg();
1042 // Get the W variant of that register.
1043 Register DstRegW
= TRI
->getSubReg(DstRegX
, AArch64::sub_32
);
1044 // Update the result of LDP to use the W instead of the X variant.
1045 DstMO
.setReg(DstRegW
);
1046 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
1047 LLVM_DEBUG(dbgs() << "\n");
1048 // Make the machine verifier happy by providing a definition for
1050 // Insert this definition right after the generated LDP, i.e., before
1052 MachineInstrBuilder MIBKill
=
1053 BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(TargetOpcode::KILL
), DstRegW
)
1055 .addReg(DstRegX
, RegState::Define
);
1056 MIBKill
->getOperand(2).setImplicit();
1057 // Create the sign extension.
1058 MachineInstrBuilder MIBSXTW
=
1059 BuildMI(*MBB
, InsertionPoint
, DL
, TII
->get(AArch64::SBFMXri
), DstRegX
)
1064 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1065 LLVM_DEBUG(((MachineInstr
*)MIBSXTW
)->print(dbgs()));
1067 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
1069 LLVM_DEBUG(dbgs() << "\n");
1072 for (const MachineOperand
&MOP
: phys_regs_and_masks(*I
))
1073 if (MOP
.isReg() && MOP
.isKill())
1074 DefinedInBB
.addReg(MOP
.getReg());
1076 // Erase the old instructions.
1077 I
->eraseFromParent();
1078 Paired
->eraseFromParent();
1083 MachineBasicBlock::iterator
1084 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI
,
1085 MachineBasicBlock::iterator StoreI
) {
1086 MachineBasicBlock::iterator NextI
=
1087 next_nodbg(LoadI
, LoadI
->getParent()->end());
1089 int LoadSize
= TII
->getMemScale(*LoadI
);
1090 int StoreSize
= TII
->getMemScale(*StoreI
);
1091 Register LdRt
= getLdStRegOp(*LoadI
).getReg();
1092 const MachineOperand
&StMO
= getLdStRegOp(*StoreI
);
1093 Register StRt
= getLdStRegOp(*StoreI
).getReg();
1094 bool IsStoreXReg
= TRI
->getRegClass(AArch64::GPR64RegClassID
)->contains(StRt
);
1096 assert((IsStoreXReg
||
1097 TRI
->getRegClass(AArch64::GPR32RegClassID
)->contains(StRt
)) &&
1098 "Unexpected RegClass");
1100 MachineInstr
*BitExtMI
;
1101 if (LoadSize
== StoreSize
&& (LoadSize
== 4 || LoadSize
== 8)) {
1102 // Remove the load, if the destination register of the loads is the same
1103 // register for stored value.
1104 if (StRt
== LdRt
&& LoadSize
== 8) {
1105 for (MachineInstr
&MI
: make_range(StoreI
->getIterator(),
1106 LoadI
->getIterator())) {
1107 if (MI
.killsRegister(StRt
, TRI
)) {
1108 MI
.clearRegisterKills(StRt
, TRI
);
1112 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1113 LLVM_DEBUG(LoadI
->print(dbgs()));
1114 LLVM_DEBUG(dbgs() << "\n");
1115 LoadI
->eraseFromParent();
1118 // Replace the load with a mov if the load and store are in the same size.
1120 BuildMI(*LoadI
->getParent(), LoadI
, LoadI
->getDebugLoc(),
1121 TII
->get(IsStoreXReg
? AArch64::ORRXrs
: AArch64::ORRWrs
), LdRt
)
1122 .addReg(IsStoreXReg
? AArch64::XZR
: AArch64::WZR
)
1124 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL
, 0))
1125 .setMIFlags(LoadI
->getFlags());
1127 // FIXME: Currently we disable this transformation in big-endian targets as
1128 // performance and correctness are verified only in little-endian.
1129 if (!Subtarget
->isLittleEndian())
1131 bool IsUnscaled
= TII
->hasUnscaledLdStOffset(*LoadI
);
1132 assert(IsUnscaled
== TII
->hasUnscaledLdStOffset(*StoreI
) &&
1133 "Unsupported ld/st match");
1134 assert(LoadSize
<= StoreSize
&& "Invalid load size");
1135 int UnscaledLdOffset
= IsUnscaled
1136 ? getLdStOffsetOp(*LoadI
).getImm()
1137 : getLdStOffsetOp(*LoadI
).getImm() * LoadSize
;
1138 int UnscaledStOffset
= IsUnscaled
1139 ? getLdStOffsetOp(*StoreI
).getImm()
1140 : getLdStOffsetOp(*StoreI
).getImm() * StoreSize
;
1141 int Width
= LoadSize
* 8;
1143 IsStoreXReg
? Register(TRI
->getMatchingSuperReg(
1144 LdRt
, AArch64::sub_32
, &AArch64::GPR64RegClass
))
1147 assert((UnscaledLdOffset
>= UnscaledStOffset
&&
1148 (UnscaledLdOffset
+ LoadSize
) <= UnscaledStOffset
+ StoreSize
) &&
1151 int Immr
= 8 * (UnscaledLdOffset
- UnscaledStOffset
);
1152 int Imms
= Immr
+ Width
- 1;
1153 if (UnscaledLdOffset
== UnscaledStOffset
) {
1154 uint32_t AndMaskEncoded
= ((IsStoreXReg
? 1 : 0) << 12) // N
1155 | ((Immr
) << 6) // immr
1156 | ((Imms
) << 0) // imms
1160 BuildMI(*LoadI
->getParent(), LoadI
, LoadI
->getDebugLoc(),
1161 TII
->get(IsStoreXReg
? AArch64::ANDXri
: AArch64::ANDWri
),
1164 .addImm(AndMaskEncoded
)
1165 .setMIFlags(LoadI
->getFlags());
1168 BuildMI(*LoadI
->getParent(), LoadI
, LoadI
->getDebugLoc(),
1169 TII
->get(IsStoreXReg
? AArch64::UBFMXri
: AArch64::UBFMWri
),
1174 .setMIFlags(LoadI
->getFlags());
1178 // Clear kill flags between store and load.
1179 for (MachineInstr
&MI
: make_range(StoreI
->getIterator(),
1180 BitExtMI
->getIterator()))
1181 if (MI
.killsRegister(StRt
, TRI
)) {
1182 MI
.clearRegisterKills(StRt
, TRI
);
1186 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1187 LLVM_DEBUG(StoreI
->print(dbgs()));
1188 LLVM_DEBUG(dbgs() << " ");
1189 LLVM_DEBUG(LoadI
->print(dbgs()));
1190 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1191 LLVM_DEBUG(StoreI
->print(dbgs()));
1192 LLVM_DEBUG(dbgs() << " ");
1193 LLVM_DEBUG((BitExtMI
)->print(dbgs()));
1194 LLVM_DEBUG(dbgs() << "\n");
1196 // Erase the old instructions.
1197 LoadI
->eraseFromParent();
1201 static bool inBoundsForPair(bool IsUnscaled
, int Offset
, int OffsetStride
) {
1202 // Convert the byte-offset used by unscaled into an "element" offset used
1203 // by the scaled pair load/store instructions.
1205 // If the byte-offset isn't a multiple of the stride, there's no point
1206 // trying to match it.
1207 if (Offset
% OffsetStride
)
1209 Offset
/= OffsetStride
;
1211 return Offset
<= 63 && Offset
>= -64;
1214 // Do alignment, specialized to power of 2 and for signed ints,
1215 // avoiding having to do a C-style cast from uint_64t to int when
1216 // using alignTo from include/llvm/Support/MathExtras.h.
1217 // FIXME: Move this function to include/MathExtras.h?
1218 static int alignTo(int Num
, int PowOf2
) {
1219 return (Num
+ PowOf2
- 1) & ~(PowOf2
- 1);
1222 static bool mayAlias(MachineInstr
&MIa
,
1223 SmallVectorImpl
<MachineInstr
*> &MemInsns
,
1224 AliasAnalysis
*AA
) {
1225 for (MachineInstr
*MIb
: MemInsns
)
1226 if (MIa
.mayAlias(AA
, *MIb
, /*UseTBAA*/ false))
1232 bool AArch64LoadStoreOpt::findMatchingStore(
1233 MachineBasicBlock::iterator I
, unsigned Limit
,
1234 MachineBasicBlock::iterator
&StoreI
) {
1235 MachineBasicBlock::iterator B
= I
->getParent()->begin();
1236 MachineBasicBlock::iterator MBBI
= I
;
1237 MachineInstr
&LoadMI
= *I
;
1238 Register BaseReg
= getLdStBaseOp(LoadMI
).getReg();
1240 // If the load is the first instruction in the block, there's obviously
1241 // not any matching store.
1245 // Track which register units have been modified and used between the first
1246 // insn and the second insn.
1247 ModifiedRegUnits
.clear();
1248 UsedRegUnits
.clear();
1252 MBBI
= prev_nodbg(MBBI
, B
);
1253 MachineInstr
&MI
= *MBBI
;
1255 // Don't count transient instructions towards the search limit since there
1256 // may be different numbers of them if e.g. debug information is present.
1257 if (!MI
.isTransient())
1260 // If the load instruction reads directly from the address to which the
1261 // store instruction writes and the stored value is not modified, we can
1262 // promote the load. Since we do not handle stores with pre-/post-index,
1263 // it's unnecessary to check if BaseReg is modified by the store itself.
1264 // Also we can't handle stores without an immediate offset operand,
1265 // while the operand might be the address for a global variable.
1266 if (MI
.mayStore() && isMatchingStore(LoadMI
, MI
) &&
1267 BaseReg
== getLdStBaseOp(MI
).getReg() && getLdStOffsetOp(MI
).isImm() &&
1268 isLdOffsetInRangeOfSt(LoadMI
, MI
, TII
) &&
1269 ModifiedRegUnits
.available(getLdStRegOp(MI
).getReg())) {
1277 // Update modified / uses register units.
1278 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1280 // Otherwise, if the base register is modified, we have no match, so
1282 if (!ModifiedRegUnits
.available(BaseReg
))
1285 // If we encounter a store aliased with the load, return early.
1286 if (MI
.mayStore() && LoadMI
.mayAlias(AA
, MI
, /*UseTBAA*/ false))
1288 } while (MBBI
!= B
&& Count
< Limit
);
1292 // Returns true if FirstMI and MI are candidates for merging or pairing.
1293 // Otherwise, returns false.
1294 static bool areCandidatesToMergeOrPair(MachineInstr
&FirstMI
, MachineInstr
&MI
,
1295 LdStPairFlags
&Flags
,
1296 const AArch64InstrInfo
*TII
) {
1297 // If this is volatile or if pairing is suppressed, not a candidate.
1298 if (MI
.hasOrderedMemoryRef() || TII
->isLdStPairSuppressed(MI
))
1301 // We should have already checked FirstMI for pair suppression and volatility.
1302 assert(!FirstMI
.hasOrderedMemoryRef() &&
1303 !TII
->isLdStPairSuppressed(FirstMI
) &&
1304 "FirstMI shouldn't get here if either of these checks are true.");
1306 unsigned OpcA
= FirstMI
.getOpcode();
1307 unsigned OpcB
= MI
.getOpcode();
1309 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1311 return !AArch64InstrInfo::isPreLdSt(FirstMI
);
1313 // Try to match a sign-extended load/store with a zero-extended load/store.
1314 bool IsValidLdStrOpc
, PairIsValidLdStrOpc
;
1315 unsigned NonSExtOpc
= getMatchingNonSExtOpcode(OpcA
, &IsValidLdStrOpc
);
1316 assert(IsValidLdStrOpc
&&
1317 "Given Opc should be a Load or Store with an immediate");
1318 // OpcA will be the first instruction in the pair.
1319 if (NonSExtOpc
== getMatchingNonSExtOpcode(OpcB
, &PairIsValidLdStrOpc
)) {
1320 Flags
.setSExtIdx(NonSExtOpc
== (unsigned)OpcA
? 1 : 0);
1324 // If the second instruction isn't even a mergable/pairable load/store, bail
1326 if (!PairIsValidLdStrOpc
)
1329 // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1331 if (isNarrowStore(OpcA
) || isNarrowStore(OpcB
))
1334 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1335 // LDR<S,D,Q,W,X>pre-LDR<S,D,Q,W,X>ui
1336 // are candidate pairs that can be merged.
1337 if (isPreLdStPairCandidate(FirstMI
, MI
))
1340 // Try to match an unscaled load/store with a scaled load/store.
1341 return TII
->hasUnscaledLdStOffset(OpcA
) != TII
->hasUnscaledLdStOffset(OpcB
) &&
1342 getMatchingPairOpcode(OpcA
) == getMatchingPairOpcode(OpcB
);
1344 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1348 canRenameUpToDef(MachineInstr
&FirstMI
, LiveRegUnits
&UsedInBetween
,
1349 SmallPtrSetImpl
<const TargetRegisterClass
*> &RequiredClasses
,
1350 const TargetRegisterInfo
*TRI
) {
1351 if (!FirstMI
.mayStore())
1354 // Check if we can find an unused register which we can use to rename
1355 // the register used by the first load/store.
1356 auto *RegClass
= TRI
->getMinimalPhysRegClass(getLdStRegOp(FirstMI
).getReg());
1357 MachineFunction
&MF
= *FirstMI
.getParent()->getParent();
1358 if (!RegClass
|| !MF
.getRegInfo().tracksLiveness())
1361 auto RegToRename
= getLdStRegOp(FirstMI
).getReg();
1362 // For now, we only rename if the store operand gets killed at the store.
1363 if (!getLdStRegOp(FirstMI
).isKill() &&
1364 !any_of(FirstMI
.operands(),
1365 [TRI
, RegToRename
](const MachineOperand
&MOP
) {
1366 return MOP
.isReg() && !MOP
.isDebug() && MOP
.getReg() &&
1367 MOP
.isImplicit() && MOP
.isKill() &&
1368 TRI
->regsOverlap(RegToRename
, MOP
.getReg());
1370 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI
<< "\n");
1373 auto canRenameMOP
= [TRI
](const MachineOperand
&MOP
) {
1375 auto *RegClass
= TRI
->getMinimalPhysRegClass(MOP
.getReg());
1376 // Renaming registers with multiple disjunct sub-registers (e.g. the
1377 // result of a LD3) means that all sub-registers are renamed, potentially
1378 // impacting other instructions we did not check. Bail out.
1379 // Note that this relies on the structure of the AArch64 register file. In
1380 // particular, a subregister cannot be written without overwriting the
1382 if (RegClass
->HasDisjunctSubRegs
) {
1385 << " Cannot rename operands with multiple disjunct subregisters ("
1390 return MOP
.isImplicit() ||
1391 (MOP
.isRenamable() && !MOP
.isEarlyClobber() && !MOP
.isTied());
1394 bool FoundDef
= false;
1396 // For each instruction between FirstMI and the previous def for RegToRename,
1398 // * check if we can rename RegToRename in this instruction
1399 // * collect the registers used and required register classes for RegToRename.
1400 std::function
<bool(MachineInstr
&, bool)> CheckMIs
= [&](MachineInstr
&MI
,
1402 LLVM_DEBUG(dbgs() << "Checking " << MI
<< "\n");
1403 // Currently we do not try to rename across frame-setup instructions.
1404 if (MI
.getFlag(MachineInstr::FrameSetup
)) {
1405 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions currently ("
1410 UsedInBetween
.accumulate(MI
);
1412 // For a definition, check that we can rename the definition and exit the
1416 // For defs, check if we can rename the first def of RegToRename.
1418 // For some pseudo instructions, we might not generate code in the end
1419 // (e.g. KILL) and we would end up without a correct def for the rename
1421 // TODO: This might be overly conservative and we could handle those cases
1422 // in multiple ways:
1423 // 1. Insert an extra copy, to materialize the def.
1424 // 2. Skip pseudo-defs until we find an non-pseudo def.
1425 if (MI
.isPseudo()) {
1426 LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI
1431 for (auto &MOP
: MI
.operands()) {
1432 if (!MOP
.isReg() || !MOP
.isDef() || MOP
.isDebug() || !MOP
.getReg() ||
1433 !TRI
->regsOverlap(MOP
.getReg(), RegToRename
))
1435 if (!canRenameMOP(MOP
)) {
1437 << " Cannot rename " << MOP
<< " in " << MI
<< "\n");
1440 RequiredClasses
.insert(TRI
->getMinimalPhysRegClass(MOP
.getReg()));
1444 for (auto &MOP
: MI
.operands()) {
1445 if (!MOP
.isReg() || MOP
.isDebug() || !MOP
.getReg() ||
1446 !TRI
->regsOverlap(MOP
.getReg(), RegToRename
))
1449 if (!canRenameMOP(MOP
)) {
1451 << " Cannot rename " << MOP
<< " in " << MI
<< "\n");
1454 RequiredClasses
.insert(TRI
->getMinimalPhysRegClass(MOP
.getReg()));
1460 if (!forAllMIsUntilDef(FirstMI
, RegToRename
, TRI
, LdStLimit
, CheckMIs
))
1464 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1470 // Check if we can find a physical register for renaming. This register must:
1471 // * not be defined up to FirstMI (checking DefinedInBB)
1472 // * not used between the MI and the defining instruction of the register to
1473 // rename (checked using UsedInBetween).
1474 // * is available in all used register classes (checked using RequiredClasses).
1475 static Optional
<MCPhysReg
> tryToFindRegisterToRename(
1476 MachineInstr
&FirstMI
, MachineInstr
&MI
, LiveRegUnits
&DefinedInBB
,
1477 LiveRegUnits
&UsedInBetween
,
1478 SmallPtrSetImpl
<const TargetRegisterClass
*> &RequiredClasses
,
1479 const TargetRegisterInfo
*TRI
) {
1480 auto &MF
= *FirstMI
.getParent()->getParent();
1481 MachineRegisterInfo
&RegInfo
= MF
.getRegInfo();
1483 // Checks if any sub- or super-register of PR is callee saved.
1484 auto AnySubOrSuperRegCalleePreserved
= [&MF
, TRI
](MCPhysReg PR
) {
1485 return any_of(TRI
->sub_and_superregs_inclusive(PR
),
1486 [&MF
, TRI
](MCPhysReg SubOrSuper
) {
1487 return TRI
->isCalleeSavedPhysReg(SubOrSuper
, MF
);
1491 // Check if PR or one of its sub- or super-registers can be used for all
1492 // required register classes.
1493 auto CanBeUsedForAllClasses
= [&RequiredClasses
, TRI
](MCPhysReg PR
) {
1494 return all_of(RequiredClasses
, [PR
, TRI
](const TargetRegisterClass
*C
) {
1495 return any_of(TRI
->sub_and_superregs_inclusive(PR
),
1496 [C
, TRI
](MCPhysReg SubOrSuper
) {
1497 return C
== TRI
->getMinimalPhysRegClass(SubOrSuper
);
1502 auto *RegClass
= TRI
->getMinimalPhysRegClass(getLdStRegOp(FirstMI
).getReg());
1503 for (const MCPhysReg
&PR
: *RegClass
) {
1504 if (DefinedInBB
.available(PR
) && UsedInBetween
.available(PR
) &&
1505 !RegInfo
.isReserved(PR
) && !AnySubOrSuperRegCalleePreserved(PR
) &&
1506 CanBeUsedForAllClasses(PR
)) {
1507 DefinedInBB
.addReg(PR
);
1508 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR
, TRI
)
1513 LLVM_DEBUG(dbgs() << "No rename register found from "
1514 << TRI
->getRegClassName(RegClass
) << "\n");
1518 /// Scan the instructions looking for a load/store that can be combined with the
1519 /// current instruction into a wider equivalent or a load/store pair.
1520 MachineBasicBlock::iterator
1521 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I
,
1522 LdStPairFlags
&Flags
, unsigned Limit
,
1523 bool FindNarrowMerge
) {
1524 MachineBasicBlock::iterator E
= I
->getParent()->end();
1525 MachineBasicBlock::iterator MBBI
= I
;
1526 MachineBasicBlock::iterator MBBIWithRenameReg
;
1527 MachineInstr
&FirstMI
= *I
;
1528 MBBI
= next_nodbg(MBBI
, E
);
1530 bool MayLoad
= FirstMI
.mayLoad();
1531 bool IsUnscaled
= TII
->hasUnscaledLdStOffset(FirstMI
);
1532 Register Reg
= getLdStRegOp(FirstMI
).getReg();
1533 Register BaseReg
= getLdStBaseOp(FirstMI
).getReg();
1534 int Offset
= getLdStOffsetOp(FirstMI
).getImm();
1535 int OffsetStride
= IsUnscaled
? TII
->getMemScale(FirstMI
) : 1;
1536 bool IsPromotableZeroStore
= isPromotableZeroStoreInst(FirstMI
);
1538 Optional
<bool> MaybeCanRename
= None
;
1539 if (!EnableRenaming
)
1540 MaybeCanRename
= {false};
1542 SmallPtrSet
<const TargetRegisterClass
*, 5> RequiredClasses
;
1543 LiveRegUnits UsedInBetween
;
1544 UsedInBetween
.init(*TRI
);
1546 Flags
.clearRenameReg();
1548 // Track which register units have been modified and used between the first
1549 // insn (inclusive) and the second insn.
1550 ModifiedRegUnits
.clear();
1551 UsedRegUnits
.clear();
1553 // Remember any instructions that read/write memory between FirstMI and MI.
1554 SmallVector
<MachineInstr
*, 4> MemInsns
;
1556 for (unsigned Count
= 0; MBBI
!= E
&& Count
< Limit
;
1557 MBBI
= next_nodbg(MBBI
, E
)) {
1558 MachineInstr
&MI
= *MBBI
;
1560 UsedInBetween
.accumulate(MI
);
1562 // Don't count transient instructions towards the search limit since there
1563 // may be different numbers of them if e.g. debug information is present.
1564 if (!MI
.isTransient())
1567 Flags
.setSExtIdx(-1);
1568 if (areCandidatesToMergeOrPair(FirstMI
, MI
, Flags
, TII
) &&
1569 getLdStOffsetOp(MI
).isImm()) {
1570 assert(MI
.mayLoadOrStore() && "Expected memory operation.");
1571 // If we've found another instruction with the same opcode, check to see
1572 // if the base and offset are compatible with our starting instruction.
1573 // These instructions all have scaled immediate operands, so we just
1574 // check for +1/-1. Make sure to check the new instruction offset is
1575 // actually an immediate and not a symbolic reference destined for
1577 Register MIBaseReg
= getLdStBaseOp(MI
).getReg();
1578 int MIOffset
= getLdStOffsetOp(MI
).getImm();
1579 bool MIIsUnscaled
= TII
->hasUnscaledLdStOffset(MI
);
1580 if (IsUnscaled
!= MIIsUnscaled
) {
1581 // We're trying to pair instructions that differ in how they are scaled.
1582 // If FirstMI is scaled then scale the offset of MI accordingly.
1583 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1584 int MemSize
= TII
->getMemScale(MI
);
1586 // If the unscaled offset isn't a multiple of the MemSize, we can't
1587 // pair the operations together: bail and keep looking.
1588 if (MIOffset
% MemSize
) {
1589 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1591 MemInsns
.push_back(&MI
);
1594 MIOffset
/= MemSize
;
1596 MIOffset
*= MemSize
;
1600 bool IsPreLdSt
= isPreLdStPairCandidate(FirstMI
, MI
);
1602 if (BaseReg
== MIBaseReg
) {
1603 // If the offset of the second ld/st is not equal to the size of the
1604 // destination register it can’t be paired with a pre-index ld/st
1605 // pair. Additionally if the base reg is used or modified the operations
1606 // can't be paired: bail and keep looking.
1608 bool IsOutOfBounds
= MIOffset
!= TII
->getMemScale(MI
);
1609 bool IsBaseRegUsed
=
1610 !UsedRegUnits
.available(getLdStBaseOp(MI
).getReg());
1611 bool IsBaseRegModified
=
1612 !ModifiedRegUnits
.available(getLdStBaseOp(MI
).getReg());
1613 // If the stored value and the address of the second instruction is
1614 // the same, it needs to be using the updated register and therefore
1615 // it must not be folded.
1616 bool IsMIRegTheSame
= TRI
->regsOverlap(getLdStRegOp(MI
).getReg(),
1617 getLdStBaseOp(MI
).getReg());
1618 if (IsOutOfBounds
|| IsBaseRegUsed
|| IsBaseRegModified
||
1620 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1622 MemInsns
.push_back(&MI
);
1626 if ((Offset
!= MIOffset
+ OffsetStride
) &&
1627 (Offset
+ OffsetStride
!= MIOffset
)) {
1628 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1630 MemInsns
.push_back(&MI
);
1635 int MinOffset
= Offset
< MIOffset
? Offset
: MIOffset
;
1636 if (FindNarrowMerge
) {
1637 // If the alignment requirements of the scaled wide load/store
1638 // instruction can't express the offset of the scaled narrow input,
1639 // bail and keep looking. For promotable zero stores, allow only when
1640 // the stored value is the same (i.e., WZR).
1641 if ((!IsUnscaled
&& alignTo(MinOffset
, 2) != MinOffset
) ||
1642 (IsPromotableZeroStore
&& Reg
!= getLdStRegOp(MI
).getReg())) {
1643 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1645 MemInsns
.push_back(&MI
);
1649 // Pairwise instructions have a 7-bit signed offset field. Single
1650 // insns have a 12-bit unsigned offset field. If the resultant
1651 // immediate offset of merging these instructions is out of range for
1652 // a pairwise instruction, bail and keep looking.
1653 if (!inBoundsForPair(IsUnscaled
, MinOffset
, OffsetStride
)) {
1654 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1656 MemInsns
.push_back(&MI
);
1659 // If the alignment requirements of the paired (scaled) instruction
1660 // can't express the offset of the unscaled input, bail and keep
1662 if (IsUnscaled
&& (alignTo(MinOffset
, OffsetStride
) != MinOffset
)) {
1663 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
,
1665 MemInsns
.push_back(&MI
);
1669 // If the destination register of one load is the same register or a
1670 // sub/super register of the other load, bail and keep looking. A
1671 // load-pair instruction with both destination registers the same is
1672 // UNPREDICTABLE and will result in an exception.
1674 TRI
->isSuperOrSubRegisterEq(Reg
, getLdStRegOp(MI
).getReg())) {
1675 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
,
1677 MemInsns
.push_back(&MI
);
1681 // If the BaseReg has been modified, then we cannot do the optimization.
1682 // For example, in the following pattern
1686 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1687 if (!ModifiedRegUnits
.available(BaseReg
))
1690 // If the Rt of the second instruction was not modified or used between
1691 // the two instructions and none of the instructions between the second
1692 // and first alias with the second, we can combine the second into the
1694 if (ModifiedRegUnits
.available(getLdStRegOp(MI
).getReg()) &&
1696 !UsedRegUnits
.available(getLdStRegOp(MI
).getReg())) &&
1697 !mayAlias(MI
, MemInsns
, AA
)) {
1699 Flags
.setMergeForward(false);
1700 Flags
.clearRenameReg();
1704 // Likewise, if the Rt of the first instruction is not modified or used
1705 // between the two instructions and none of the instructions between the
1706 // first and the second alias with the first, we can combine the first
1709 !UsedRegUnits
.available(getLdStRegOp(FirstMI
).getReg())) &&
1710 !mayAlias(FirstMI
, MemInsns
, AA
)) {
1712 if (ModifiedRegUnits
.available(getLdStRegOp(FirstMI
).getReg())) {
1713 Flags
.setMergeForward(true);
1714 Flags
.clearRenameReg();
1718 if (DebugCounter::shouldExecute(RegRenamingCounter
)) {
1719 if (!MaybeCanRename
)
1720 MaybeCanRename
= {canRenameUpToDef(FirstMI
, UsedInBetween
,
1721 RequiredClasses
, TRI
)};
1723 if (*MaybeCanRename
) {
1724 Optional
<MCPhysReg
> MaybeRenameReg
= tryToFindRegisterToRename(
1725 FirstMI
, MI
, DefinedInBB
, UsedInBetween
, RequiredClasses
,
1727 if (MaybeRenameReg
) {
1728 Flags
.setRenameReg(*MaybeRenameReg
);
1729 Flags
.setMergeForward(true);
1730 MBBIWithRenameReg
= MBBI
;
1735 // Unable to combine these instructions due to interference in between.
1740 if (Flags
.getRenameReg())
1741 return MBBIWithRenameReg
;
1743 // If the instruction wasn't a matching load or store. Stop searching if we
1744 // encounter a call instruction that might modify memory.
1748 // Update modified / uses register units.
1749 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1751 // Otherwise, if the base register is modified, we have no match, so
1753 if (!ModifiedRegUnits
.available(BaseReg
))
1756 // Update list of instructions that read/write memory.
1757 if (MI
.mayLoadOrStore())
1758 MemInsns
.push_back(&MI
);
1763 MachineBasicBlock::iterator
1764 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I
,
1765 MachineBasicBlock::iterator Update
,
1767 assert((Update
->getOpcode() == AArch64::ADDXri
||
1768 Update
->getOpcode() == AArch64::SUBXri
) &&
1769 "Unexpected base register update instruction to merge!");
1770 MachineBasicBlock::iterator E
= I
->getParent()->end();
1771 MachineBasicBlock::iterator NextI
= next_nodbg(I
, E
);
1772 // Return the instruction following the merged instruction, which is
1773 // the instruction following our unmerged load. Unless that's the add/sub
1774 // instruction we're merging, in which case it's the one after that.
1775 if (NextI
== Update
)
1776 NextI
= next_nodbg(NextI
, E
);
1778 int Value
= Update
->getOperand(2).getImm();
1779 assert(AArch64_AM::getShiftValue(Update
->getOperand(3).getImm()) == 0 &&
1780 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1781 if (Update
->getOpcode() == AArch64::SUBXri
)
1784 unsigned NewOpc
= IsPreIdx
? getPreIndexedOpcode(I
->getOpcode())
1785 : getPostIndexedOpcode(I
->getOpcode());
1786 MachineInstrBuilder MIB
;
1787 int Scale
, MinOffset
, MaxOffset
;
1788 getPrePostIndexedMemOpInfo(*I
, Scale
, MinOffset
, MaxOffset
);
1789 if (!isPairedLdSt(*I
)) {
1790 // Non-paired instruction.
1791 MIB
= BuildMI(*I
->getParent(), I
, I
->getDebugLoc(), TII
->get(NewOpc
))
1792 .add(getLdStRegOp(*Update
))
1793 .add(getLdStRegOp(*I
))
1794 .add(getLdStBaseOp(*I
))
1795 .addImm(Value
/ Scale
)
1796 .setMemRefs(I
->memoperands())
1797 .setMIFlags(I
->mergeFlagsWith(*Update
));
1799 // Paired instruction.
1800 MIB
= BuildMI(*I
->getParent(), I
, I
->getDebugLoc(), TII
->get(NewOpc
))
1801 .add(getLdStRegOp(*Update
))
1802 .add(getLdStRegOp(*I
, 0))
1803 .add(getLdStRegOp(*I
, 1))
1804 .add(getLdStBaseOp(*I
))
1805 .addImm(Value
/ Scale
)
1806 .setMemRefs(I
->memoperands())
1807 .setMIFlags(I
->mergeFlagsWith(*Update
));
1813 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1816 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1818 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
1819 LLVM_DEBUG(I
->print(dbgs()));
1820 LLVM_DEBUG(dbgs() << " ");
1821 LLVM_DEBUG(Update
->print(dbgs()));
1822 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1823 LLVM_DEBUG(((MachineInstr
*)MIB
)->print(dbgs()));
1824 LLVM_DEBUG(dbgs() << "\n");
1826 // Erase the old instructions for the block.
1827 I
->eraseFromParent();
1828 Update
->eraseFromParent();
1833 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr
&MemMI
,
1835 unsigned BaseReg
, int Offset
) {
1836 switch (MI
.getOpcode()) {
1839 case AArch64::SUBXri
:
1840 case AArch64::ADDXri
:
1841 // Make sure it's a vanilla immediate operand, not a relocation or
1842 // anything else we can't handle.
1843 if (!MI
.getOperand(2).isImm())
1845 // Watch out for 1 << 12 shifted value.
1846 if (AArch64_AM::getShiftValue(MI
.getOperand(3).getImm()))
1849 // The update instruction source and destination register must be the
1850 // same as the load/store base register.
1851 if (MI
.getOperand(0).getReg() != BaseReg
||
1852 MI
.getOperand(1).getReg() != BaseReg
)
1855 int UpdateOffset
= MI
.getOperand(2).getImm();
1856 if (MI
.getOpcode() == AArch64::SUBXri
)
1857 UpdateOffset
= -UpdateOffset
;
1859 // The immediate must be a multiple of the scaling factor of the pre/post
1860 // indexed instruction.
1861 int Scale
, MinOffset
, MaxOffset
;
1862 getPrePostIndexedMemOpInfo(MemMI
, Scale
, MinOffset
, MaxOffset
);
1863 if (UpdateOffset
% Scale
!= 0)
1866 // Scaled offset must fit in the instruction immediate.
1867 int ScaledOffset
= UpdateOffset
/ Scale
;
1868 if (ScaledOffset
> MaxOffset
|| ScaledOffset
< MinOffset
)
1871 // If we have a non-zero Offset, we check that it matches the amount
1872 // we're adding to the register.
1873 if (!Offset
|| Offset
== UpdateOffset
)
1880 static bool needsWinCFI(const MachineFunction
*MF
) {
1881 return MF
->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1882 MF
->getFunction().needsUnwindTableEntry();
1885 MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1886 MachineBasicBlock::iterator I
, int UnscaledOffset
, unsigned Limit
) {
1887 MachineBasicBlock::iterator E
= I
->getParent()->end();
1888 MachineInstr
&MemMI
= *I
;
1889 MachineBasicBlock::iterator MBBI
= I
;
1891 Register BaseReg
= getLdStBaseOp(MemMI
).getReg();
1892 int MIUnscaledOffset
= getLdStOffsetOp(MemMI
).getImm() * TII
->getMemScale(MemMI
);
1894 // Scan forward looking for post-index opportunities. Updating instructions
1895 // can't be formed if the memory instruction doesn't have the offset we're
1897 if (MIUnscaledOffset
!= UnscaledOffset
)
1900 // If the base register overlaps a source/destination register, we can't
1901 // merge the update. This does not apply to tag store instructions which
1902 // ignore the address part of the source register.
1903 // This does not apply to STGPi as well, which does not have unpredictable
1904 // behavior in this case unlike normal stores, and always performs writeback
1905 // after reading the source register value.
1906 if (!isTagStore(MemMI
) && MemMI
.getOpcode() != AArch64::STGPi
) {
1907 bool IsPairedInsn
= isPairedLdSt(MemMI
);
1908 for (unsigned i
= 0, e
= IsPairedInsn
? 2 : 1; i
!= e
; ++i
) {
1909 Register DestReg
= getLdStRegOp(MemMI
, i
).getReg();
1910 if (DestReg
== BaseReg
|| TRI
->isSubRegister(BaseReg
, DestReg
))
1915 // Track which register units have been modified and used between the first
1916 // insn (inclusive) and the second insn.
1917 ModifiedRegUnits
.clear();
1918 UsedRegUnits
.clear();
1919 MBBI
= next_nodbg(MBBI
, E
);
1921 // We can't post-increment the stack pointer if any instruction between
1922 // the memory access (I) and the increment (MBBI) can access the memory
1923 // region defined by [SP, MBBI].
1924 const bool BaseRegSP
= BaseReg
== AArch64::SP
;
1925 if (BaseRegSP
&& needsWinCFI(I
->getMF())) {
1926 // FIXME: For now, we always block the optimization over SP in windows
1927 // targets as it requires to adjust the unwind/debug info, messing up
1928 // the unwind info can actually cause a miscompile.
1932 for (unsigned Count
= 0; MBBI
!= E
&& Count
< Limit
;
1933 MBBI
= next_nodbg(MBBI
, E
)) {
1934 MachineInstr
&MI
= *MBBI
;
1936 // Don't count transient instructions towards the search limit since there
1937 // may be different numbers of them if e.g. debug information is present.
1938 if (!MI
.isTransient())
1941 // If we found a match, return it.
1942 if (isMatchingUpdateInsn(*I
, MI
, BaseReg
, UnscaledOffset
))
1945 // Update the status of what the instruction clobbered and used.
1946 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
1948 // Otherwise, if the base register is used or modified, we have no match, so
1950 // If we are optimizing SP, do not allow instructions that may load or store
1951 // in between the load and the optimized value update.
1952 if (!ModifiedRegUnits
.available(BaseReg
) ||
1953 !UsedRegUnits
.available(BaseReg
) ||
1954 (BaseRegSP
&& MBBI
->mayLoadOrStore()))
1960 MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1961 MachineBasicBlock::iterator I
, unsigned Limit
) {
1962 MachineBasicBlock::iterator B
= I
->getParent()->begin();
1963 MachineBasicBlock::iterator E
= I
->getParent()->end();
1964 MachineInstr
&MemMI
= *I
;
1965 MachineBasicBlock::iterator MBBI
= I
;
1966 MachineFunction
&MF
= *MemMI
.getMF();
1968 Register BaseReg
= getLdStBaseOp(MemMI
).getReg();
1969 int Offset
= getLdStOffsetOp(MemMI
).getImm();
1971 // If the load/store is the first instruction in the block, there's obviously
1972 // not any matching update. Ditto if the memory offset isn't zero.
1973 if (MBBI
== B
|| Offset
!= 0)
1975 // If the base register overlaps a destination register, we can't
1976 // merge the update.
1977 if (!isTagStore(MemMI
)) {
1978 bool IsPairedInsn
= isPairedLdSt(MemMI
);
1979 for (unsigned i
= 0, e
= IsPairedInsn
? 2 : 1; i
!= e
; ++i
) {
1980 Register DestReg
= getLdStRegOp(MemMI
, i
).getReg();
1981 if (DestReg
== BaseReg
|| TRI
->isSubRegister(BaseReg
, DestReg
))
1986 const bool BaseRegSP
= BaseReg
== AArch64::SP
;
1987 if (BaseRegSP
&& needsWinCFI(I
->getMF())) {
1988 // FIXME: For now, we always block the optimization over SP in windows
1989 // targets as it requires to adjust the unwind/debug info, messing up
1990 // the unwind info can actually cause a miscompile.
1994 const AArch64Subtarget
&Subtarget
= MF
.getSubtarget
<AArch64Subtarget
>();
1995 unsigned RedZoneSize
=
1996 Subtarget
.getTargetLowering()->getRedZoneSize(MF
.getFunction());
1998 // Track which register units have been modified and used between the first
1999 // insn (inclusive) and the second insn.
2000 ModifiedRegUnits
.clear();
2001 UsedRegUnits
.clear();
2003 bool MemAcessBeforeSPPreInc
= false;
2005 MBBI
= prev_nodbg(MBBI
, B
);
2006 MachineInstr
&MI
= *MBBI
;
2008 // Don't count transient instructions towards the search limit since there
2009 // may be different numbers of them if e.g. debug information is present.
2010 if (!MI
.isTransient())
2013 // If we found a match, return it.
2014 if (isMatchingUpdateInsn(*I
, MI
, BaseReg
, Offset
)) {
2015 // Check that the update value is within our red zone limit (which may be
2017 if (MemAcessBeforeSPPreInc
&& MBBI
->getOperand(2).getImm() > RedZoneSize
)
2022 // Update the status of what the instruction clobbered and used.
2023 LiveRegUnits::accumulateUsedDefed(MI
, ModifiedRegUnits
, UsedRegUnits
, TRI
);
2025 // Otherwise, if the base register is used or modified, we have no match, so
2027 if (!ModifiedRegUnits
.available(BaseReg
) ||
2028 !UsedRegUnits
.available(BaseReg
))
2030 // Keep track if we have a memory access before an SP pre-increment, in this
2031 // case we need to validate later that the update amount respects the red
2033 if (BaseRegSP
&& MBBI
->mayLoadOrStore())
2034 MemAcessBeforeSPPreInc
= true;
2035 } while (MBBI
!= B
&& Count
< Limit
);
2039 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2040 MachineBasicBlock::iterator
&MBBI
) {
2041 MachineInstr
&MI
= *MBBI
;
2042 // If this is a volatile load, don't mess with it.
2043 if (MI
.hasOrderedMemoryRef())
2046 // Make sure this is a reg+imm.
2047 // FIXME: It is possible to extend it to handle reg+reg cases.
2048 if (!getLdStOffsetOp(MI
).isImm())
2051 // Look backward up to LdStLimit instructions.
2052 MachineBasicBlock::iterator StoreI
;
2053 if (findMatchingStore(MBBI
, LdStLimit
, StoreI
)) {
2054 ++NumLoadsFromStoresPromoted
;
2055 // Promote the load. Keeping the iterator straight is a
2056 // pain, so we let the merge routine tell us what the next instruction
2057 // is after it's done mucking about.
2058 MBBI
= promoteLoadFromStore(MBBI
, StoreI
);
2064 // Merge adjacent zero stores into a wider store.
2065 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2066 MachineBasicBlock::iterator
&MBBI
) {
2067 assert(isPromotableZeroStoreInst(*MBBI
) && "Expected narrow store.");
2068 MachineInstr
&MI
= *MBBI
;
2069 MachineBasicBlock::iterator E
= MI
.getParent()->end();
2071 if (!TII
->isCandidateToMergeOrPair(MI
))
2074 // Look ahead up to LdStLimit instructions for a mergable instruction.
2075 LdStPairFlags Flags
;
2076 MachineBasicBlock::iterator MergeMI
=
2077 findMatchingInsn(MBBI
, Flags
, LdStLimit
, /* FindNarrowMerge = */ true);
2079 ++NumZeroStoresPromoted
;
2081 // Keeping the iterator straight is a pain, so we let the merge routine tell
2082 // us what the next instruction is after it's done mucking about.
2083 MBBI
= mergeNarrowZeroStores(MBBI
, MergeMI
, Flags
);
2089 // Find loads and stores that can be merged into a single load or store pair
2091 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator
&MBBI
) {
2092 MachineInstr
&MI
= *MBBI
;
2093 MachineBasicBlock::iterator E
= MI
.getParent()->end();
2095 if (!TII
->isCandidateToMergeOrPair(MI
))
2098 // Early exit if the offset is not possible to match. (6 bits of positive
2099 // range, plus allow an extra one in case we find a later insn that matches
2101 bool IsUnscaled
= TII
->hasUnscaledLdStOffset(MI
);
2102 int Offset
= getLdStOffsetOp(MI
).getImm();
2103 int OffsetStride
= IsUnscaled
? TII
->getMemScale(MI
) : 1;
2104 // Allow one more for offset.
2106 Offset
-= OffsetStride
;
2107 if (!inBoundsForPair(IsUnscaled
, Offset
, OffsetStride
))
2110 // Look ahead up to LdStLimit instructions for a pairable instruction.
2111 LdStPairFlags Flags
;
2112 MachineBasicBlock::iterator Paired
=
2113 findMatchingInsn(MBBI
, Flags
, LdStLimit
, /* FindNarrowMerge = */ false);
2116 if (TII
->hasUnscaledLdStOffset(MI
))
2117 ++NumUnscaledPairCreated
;
2118 // Keeping the iterator straight is a pain, so we let the merge routine tell
2119 // us what the next instruction is after it's done mucking about.
2120 auto Prev
= std::prev(MBBI
);
2121 MBBI
= mergePairedInsns(MBBI
, Paired
, Flags
);
2122 // Collect liveness info for instructions between Prev and the new position
2124 for (auto I
= std::next(Prev
); I
!= MBBI
; I
++)
2125 updateDefinedRegisters(*I
, DefinedInBB
, TRI
);
2132 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2133 (MachineBasicBlock::iterator
&MBBI
) {
2134 MachineInstr
&MI
= *MBBI
;
2135 MachineBasicBlock::iterator E
= MI
.getParent()->end();
2136 MachineBasicBlock::iterator Update
;
2138 // Look forward to try to form a post-index instruction. For example,
2140 // add x20, x20, #32
2142 // ldr x0, [x20], #32
2143 Update
= findMatchingUpdateInsnForward(MBBI
, 0, UpdateLimit
);
2145 // Merge the update into the ld/st.
2146 MBBI
= mergeUpdateInsn(MBBI
, Update
, /*IsPreIdx=*/false);
2150 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2151 if (TII
->hasUnscaledLdStOffset(MI
.getOpcode()))
2154 // Look back to try to find a pre-index instruction. For example,
2158 // ldr x1, [x0, #8]!
2159 Update
= findMatchingUpdateInsnBackward(MBBI
, UpdateLimit
);
2161 // Merge the update into the ld/st.
2162 MBBI
= mergeUpdateInsn(MBBI
, Update
, /*IsPreIdx=*/true);
2166 // The immediate in the load/store is scaled by the size of the memory
2167 // operation. The immediate in the add we're looking for,
2168 // however, is not, so adjust here.
2169 int UnscaledOffset
= getLdStOffsetOp(MI
).getImm() * TII
->getMemScale(MI
);
2171 // Look forward to try to find a pre-index instruction. For example,
2172 // ldr x1, [x0, #64]
2175 // ldr x1, [x0, #64]!
2176 Update
= findMatchingUpdateInsnForward(MBBI
, UnscaledOffset
, UpdateLimit
);
2178 // Merge the update into the ld/st.
2179 MBBI
= mergeUpdateInsn(MBBI
, Update
, /*IsPreIdx=*/true);
2186 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock
&MBB
,
2187 bool EnableNarrowZeroStOpt
) {
2189 bool Modified
= false;
2190 // Four tranformations to do here:
2191 // 1) Find loads that directly read from stores and promote them by
2192 // replacing with mov instructions. If the store is wider than the load,
2193 // the load will be replaced with a bitfield extract.
2196 // ldrh w2, [x0, #6]
2200 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
2202 if (isPromotableLoadFromStore(*MBBI
) && tryToPromoteLoadFromStore(MBBI
))
2207 // 2) Merge adjacent zero stores into a wider store.
2210 // strh wzr, [x0, #2]
2215 // str wzr, [x0, #4]
2218 if (EnableNarrowZeroStOpt
)
2219 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
2221 if (isPromotableZeroStoreInst(*MBBI
) && tryToMergeZeroStInst(MBBI
))
2226 // 3) Find loads and stores that can be merged into a single load or store
2227 // pair instruction.
2234 if (MBB
.getParent()->getRegInfo().tracksLiveness()) {
2235 DefinedInBB
.clear();
2236 DefinedInBB
.addLiveIns(MBB
);
2239 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
2241 // Track currently live registers up to this point, to help with
2242 // searching for a rename register on demand.
2243 updateDefinedRegisters(*MBBI
, DefinedInBB
, TRI
);
2244 if (TII
->isPairableLdStInst(*MBBI
) && tryToPairLdStInst(MBBI
))
2249 // 4) Find base register updates that can be merged into the load or store
2250 // as a base-reg writeback.
2256 for (MachineBasicBlock::iterator MBBI
= MBB
.begin(), E
= MBB
.end();
2258 if (isMergeableLdStUpdate(*MBBI
) && tryToMergeLdStUpdate(MBBI
))
2267 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction
&Fn
) {
2268 if (skipFunction(Fn
.getFunction()))
2271 Subtarget
= &static_cast<const AArch64Subtarget
&>(Fn
.getSubtarget());
2272 TII
= static_cast<const AArch64InstrInfo
*>(Subtarget
->getInstrInfo());
2273 TRI
= Subtarget
->getRegisterInfo();
2274 AA
= &getAnalysis
<AAResultsWrapperPass
>().getAAResults();
2276 // Resize the modified and used register unit trackers. We do this once
2277 // per function and then clear the register units each time we optimize a load
2279 ModifiedRegUnits
.init(*TRI
);
2280 UsedRegUnits
.init(*TRI
);
2281 DefinedInBB
.init(*TRI
);
2283 bool Modified
= false;
2284 bool enableNarrowZeroStOpt
= !Subtarget
->requiresStrictAlign();
2285 for (auto &MBB
: Fn
) {
2286 auto M
= optimizeBlock(MBB
, enableNarrowZeroStOpt
);
2293 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2294 // stores near one another? Note: The pre-RA instruction scheduler already has
2295 // hooks to try and schedule pairable loads/stores together to improve pairing
2296 // opportunities. Thus, pre-RA pairing pass may not be worth the effort.
2298 // FIXME: When pairing store instructions it's very possible for this pass to
2299 // hoist a store with a KILL marker above another use (without a KILL marker).
2300 // The resulting IR is invalid, but nothing uses the KILL markers after this
2301 // pass, so it's never caused a problem in practice.
2303 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2304 /// load / store optimization pass.
2305 FunctionPass
*llvm::createAArch64LoadStoreOptimizationPass() {
2306 return new AArch64LoadStoreOpt();