Revert r354244 "[DAGCombiner] Eliminate dead stores to stack."
[llvm-complete.git] / lib / Target / ARM / ARMScheduleA57.td
blob6db6eda317abf224d39afce0499cbdf0d77904a2
1 //=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the machine model for ARM Cortex-A57 to support
10 // instruction scheduling and other instruction cost heuristics.
12 //===----------------------------------------------------------------------===//
14 //===----------------------------------------------------------------------===//
15 // *** Common description and scheduling model parameters taken from AArch64 ***
16 // The Cortex-A57 is a traditional superscalar microprocessor with a
17 // conservative 3-wide in-order stage for decode and dispatch. Combined with the
18 // much wider out-of-order issue stage, this produced a need to carefully
19 // schedule micro-ops so that all three decoded each cycle are successfully
20 // issued as the reservation station(s) simply don't stay occupied for long.
21 // Therefore, IssueWidth is set to the narrower of the two at three, while still
22 // modeling the machine as out-of-order.
// Holds when TII->isCPSRDefined(*MI) is true; the move/shift variants below
// use it as their "setflags" test.
24 def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
// Holds when both TII->isCPSRDefined(*MI) and TII->isPredicated(*MI) are true.
25 def IsCPSRDefinedAndPredicatedPred :
26   SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
28 // Cortex-A57 rev. r1p0 or later. Hard-coded to false, i.e. r0px is modeled.
29 def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
31 // If Addrmode3 contains register offset (not immediate)
32 def IsLdrAm3RegOffPred :
33   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
34 // The same predicate with operand offset 2 and 3:
35 def IsLdrAm3RegOffPredX2 :
36   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
37 def IsLdrAm3RegOffPredX3 :
38   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
40 // If Addrmode3 contains "minus register"
41 def IsLdrAm3NegRegOffPred :
42   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
43 // The same predicate with operand offset 2 and 3:
44 def IsLdrAm3NegRegOffPredX2 :
45   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
46 def IsLdrAm3NegRegOffPredX3 :
47   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
49 // Load, scaled register offset, not plus LSL2
50 def IsLdstsoScaledNotOptimalPredX0 :
51   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
52 def IsLdstsoScaledNotOptimalPred :
53   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
54 def IsLdstsoScaledNotOptimalPredX2 :
55   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
57 // Load, scaled register offset
58 def IsLdstsoScaledPred :
59   SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
60 def IsLdstsoScaledPredX2 :
61   SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
// Predicates wrapping TII->isLdstSoMinusReg with second argument 0, 1 and 2
// (presumably an operand offset, as for the Addrmode3 predicates above --
// confirm against the TII implementation). They select the "minus reg" cases
// of the load, store and preload variants defined later in this file.
63 def IsLdstsoMinusRegPredX0 :
64   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
65 def IsLdstsoMinusRegPred :
66   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
67 def IsLdstsoMinusRegPredX2 :
68   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
70 // Load, scaled register offset (addressing mode 2, via isAm2ScaledReg)
71 def IsLdrAm2ScaledPred :
72   SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
74 // LDM, base reg in list
75 def IsLdmBaseRegInList :
76   SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
78 class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
79   list <SchedWriteRes> Writes = writes;
80   SchedMachineModel SchedModel = ?;
83 // *** Common description and scheduling model parameters taken from AArch64 ***
84 // (AArch64SchedA57.td)
85 def CortexA57Model : SchedMachineModel {
86   let IssueWidth        =   3; // 3-way decode and dispatch
87   let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
88   let LoadLatency       =   4; // Optimistic load latency
89   let MispredictPenalty =  16; // Fetch + Decode/Rename/Dispatch + Branch
91   // Enable partial & runtime unrolling.
92   let LoopMicroOpBufferSize = 16;
93   let CompleteModel = 1;
95   // FIXME: Remove when all errors have been fixed.
96   let FullInstRWOverlapCheck = 0;
99 //===----------------------------------------------------------------------===//
100 // Define each kind of processor resource and number available on Cortex-A57.
101 // The Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
102 // micro-ops wait for their operands and then issue out-of-order.
104 def A57UnitB : ProcResource<1>;  // Type B micro-ops
105 def A57UnitI : ProcResource<2>;  // Type I micro-ops
106 def A57UnitM : ProcResource<1>;  // Type M micro-ops
107 def A57UnitL : ProcResource<1>;  // Type L micro-ops
108 def A57UnitS : ProcResource<1>;  // Type S micro-ops
110 def A57UnitX : ProcResource<1>;  // Type X micro-ops (F1)
111 def A57UnitW : ProcResource<1>;  // Type W micro-ops (F0)
113 let SchedModel = CortexA57Model in {
114   def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops
117 let SchedModel = CortexA57Model in {
119 //===----------------------------------------------------------------------===//
120 // Define customized scheduler read/write types specific to the Cortex-A57.
122 include "ARMScheduleA57WriteRes.td"
124 // Needed for "CompleteModel = 1": support for pseudos and special instructions
125 def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
126   "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
127   "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
128   "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
129   "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
130   "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "(t2|t)?UDF$", "t2DCPS", "t2SG",
131   "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier",
132   "t__brkdiv0")>;
134 def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
136 // Specific memory instrs
137 def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
138   "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
140 // coprocessor moves
141 def : InstRW<[WriteNoop, WriteNoop], (instregex
142   "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
143   "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
144   "(t2)?MSR(banked|i|_AR|_M)?$")>;
146 // Deprecated instructions
147 def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
149 // Pseudos
150 def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
151   "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
152   "tLDRpci_pic", "(t2)?SUBS_PC_LR",
153   "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
154   "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
155   "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
156   "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
157   "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
158   "WIN__CHKSTK", "WIN__DBZCHK")>;
160 // Miscellaneous
161 // -----------------------------------------------------------------------------
163 def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
165 // --- 3.2 Branch Instructions ---
166 // B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
168 def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
169   "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
170 def : InstRW<[A57Write_1cyc_1B_1I],
171   (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
172 def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
173 // Pseudos
174 def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
175 def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
176   "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
177 def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
179 // --- 3.3 Arithmetic and Logical Instructions ---
180 // ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
181 // RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
183 def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
185 // shift by register, conditional or unconditional
186 // TODO: according to the doc, conditional uses I0/I1, unconditional uses M.
187 // Why would the more complex instruction use the simpler pipeline?
188 // This may be an error in the doc.
189 def A57WriteALUsi : SchedWriteVariant<[
190   // lsl #2, lsl #1, or lsr #1.
191   SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
192   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
194 def A57WriteALUsr : SchedWriteVariant<[
195   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
196   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
198 def A57WriteALUSsr : SchedWriteVariant<[
199   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
200   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
202 def A57ReadALUsr : SchedReadVariant<[
203   SchedVar<IsPredicatedPred, [ReadDefault]>,
204   SchedVar<NoSchedPred,      [ReadDefault]>
206 def : SchedAlias<WriteALUsi,  A57WriteALUsi>;
207 def : SchedAlias<WriteALUsr,  A57WriteALUsr>;
208 def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
209 def : SchedAlias<ReadALUsr,   A57ReadALUsr>;
211 def A57WriteCMPsr : SchedWriteVariant<[
212   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
213   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
215 def : SchedAlias<WriteCMP,   A57Write_1cyc_1I>;
216 def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
217 def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
219 // --- 3.4 Move and Shift Instructions ---
220 // Move, basic
221 // MOV{S}, MOVW, MVN{S}
222 def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
223   "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
224   "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
226 // Move, shift by immed, setflags/no setflags
227 // (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
228 // setflags = isCPSRDefined
229 def A57WriteMOVsi : SchedWriteVariant<[
230   SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
231   SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
233 def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
234   "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
235   "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
237 // shift by register, conditional or unconditional, setflags/no setflags
238 def A57WriteMOVsr : SchedWriteVariant<[
239   SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
240   SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
241   SchedVar<IsPredicatedPred,               [A57Write_2cyc_1I]>,
242   SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
244 def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
245   "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
246   "(t2|t)RORrr")>;
248 // Move, top
249 // MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
250 def A57WriteMOVT : SchedWriteVariant<[
251   SchedVar<IsR1P0AndLaterPred,             [A57Write_1cyc_1I]>,
252   SchedVar<NoSchedPred,                    [A57Write_2cyc_1M]>
254 def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
// Write sequences for the MOV_ga_pcrel pseudos:
//  - A57WriteI2pc: three A57Write_1cyc_1I writes in sequence.
//  - A57WriteI2ld: two A57Write_1cyc_1I writes followed by A57Write_4cyc_1L.
256 def A57WriteI2pc :
257   WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
258 def A57WriteI2ld :
259   WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
// Bind the sequences to the opcodes matched by the regexes below.
260 def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
261 def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
263 // +2cyc for branch forms
264 def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
266 // --- 3.5 Divide and Multiply Instructions ---
267 // Divide: SDIV, UDIV
268 // latency from documentation: 4-20 cycles, maximum taken
269 def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
270 // Multiply: tMul not bound to common WriteRes types
271 def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
272 def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
273 def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
274 def : ReadAdvance<ReadMUL, 0>;
276 // Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
277 // SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
278 // Multiply-accumulate pipelines support late-forwarding of accumulate operands
279 // from similar μops, allowing a typical sequence of multiply-accumulate μops
280 // to issue one per cycle (sched advance = 2).
281 def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
282 def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
283 def A57ReadMLA  : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
285 def : InstRW<[A57WriteMLA],
286   (instregex "t2SMLAD", "t2SMLADX", "t2SMLSD", "t2SMLSDX")>;
288 def : SchedAlias<WriteMAC16, A57WriteMLA>;
289 def : SchedAlias<WriteMAC32, A57WriteMLA>;
290 def : SchedAlias<ReadMAC,    A57ReadMLA>;
292 def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
293 def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
295 // Multiply long: SMULL, UMULL
296 def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
297 def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
299 // --- 3.6 Saturating and Parallel Arithmetic Instructions ---
300 // Parallel arith
301 // SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
302 // Conditional GE-setting instructions require three extra μops
303 // and two additional cycles to conditionally update the GE field.
304 def A57WriteParArith : SchedWriteVariant<[
305   SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
306   SchedVar<NoSchedPred,      [A57Write_2cyc_1I_1M]>
308 def : InstRW< [A57WriteParArith], (instregex
309   "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
310   "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
312 // Parallel arith with exchange: SASX, SSAX, UASX, USAX
313 def A57WriteParArithExch : SchedWriteVariant<[
314   SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
315   SchedVar<NoSchedPred,      [A57Write_3cyc_1I_1M]>
317 def : InstRW<[A57WriteParArithExch],
318   (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
320 // Parallel halving arith
321 // SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8
322 def : InstRW<[A57Write_2cyc_1M], (instregex
323   "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
324   "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
326 // Parallel halving arith with exchange
327 // SHASX, SHSAX, UHASX, UHSAX
328 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
329   "(t2)?UHASX", "(t2)?UHSAX")>;
331 // Parallel saturating arith
332 // QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
333 def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
334   "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
336 // Parallel saturating arith with exchange
337 // QASX, QSAX, UQASX, UQSAX
338 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
339   "(t2)?UQASX", "(t2)?UQSAX")>;
341 // Saturate: SSAT, SSAT16, USAT, USAT16
342 def : InstRW<[A57Write_2cyc_1M],
343   (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
345 // Saturating arith: QADD, QSUB
346 def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
348 // Saturating doubling arith: QDADD, QDSUB
349 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
351 // --- 3.7 Miscellaneous Data-Processing Instructions ---
352 // Bit field extract: SBFX, UBFX
353 def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
355 // Bit field insert/clear: BFI, BFC
356 def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
358 // Select bytes, conditional/unconditional
359 def A57WriteSEL : SchedWriteVariant<[
360   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
361   SchedVar<NoSchedPred,      [A57Write_1cyc_1I]>
363 def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
365 // Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
366 def : InstRW<[A57Write_1cyc_1I],
367   (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
369 // Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
370 def : InstRW<[A57Write_2cyc_1M],
371   (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
373 // Sign/zero extend and add, parallel: SXTAB16, UXTAB16
374 def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
376 // Sum of absolute differences: USAD8, USADA8
377 def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
379 // --- 3.8 Load Instructions ---
381 // Load, immed offset
382 // LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
383 def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
384   "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
385   "PICLDR", "tLDR")>;
387 def : InstRW<[A57Write_4cyc_1L],
388   (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
390 // For "Load, register offset, minus" we need +1cyc, +1I
391 def A57WriteLdrAm3 : SchedWriteVariant<[
392   SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
393   SchedVar<NoSchedPred,           [A57Write_4cyc_1L]>
395 def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
396 def A57WriteLdrAm3X2 : SchedWriteVariant<[
397   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
398   SchedVar<NoSchedPred,             [A57Write_4cyc_1L]>
400 def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>;
401 def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
403 def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
404   SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
405   SchedVar<IsLdstsoMinusRegPred,         [A57Write_5cyc_1I_1L]>,
406   SchedVar<NoSchedPred,                  [A57Write_4cyc_1L]>
408 def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
410 def A57WrBackOne : SchedWriteRes<[]> {
411   let Latency = 1;
412   let NumMicroOps = 0;
414 def A57WrBackTwo : SchedWriteRes<[]> {
415   let Latency = 2;
416   let NumMicroOps = 0;
418 def A57WrBackThree : SchedWriteRes<[]> {
419   let Latency = 3;
420   let NumMicroOps = 0;
423 // --- LDR pre-indexed ---
424 // Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
425 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
426   "LDRB_PRE_IMM", "t2LDRB_PRE")>;
428 // Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
429 // (5 cyc load result for not-lsl2 scaled)
430 def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
431   SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
432   SchedVar<NoSchedPred,                    [A57Write_4cyc_1L_1I]>
434 def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
435   (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
437 def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
438   SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
439   SchedVar<NoSchedPred,          [A57WrBackOne]>
441 def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
442   (instregex "LDR(H|SH|SB)_PRE")>;
443 def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
444   (instregex "t2LDR(H|SH|SB)?_PRE")>;
446 // LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
447 def A57WriteLdrDAm3Pre : SchedWriteVariant<[
448   SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
449   SchedVar<NoSchedPred,          [A57Write_4cyc_1L_1I]>
451 def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
452   SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
453   SchedVar<NoSchedPred,          [A57WrBackOne]>
455 def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
456   (instregex "LDRD_PRE")>;
457 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
458   (instregex "t2LDRD_PRE")>;
460 // --- LDR post-indexed ---
461 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
462   "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
464 def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
465   SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
466   SchedVar<NoSchedPred,        [A57WrBackOne]>
468 def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
469   (instregex "LDR(H|SH|SB)_POST")>;
470 def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
471   (instregex "t2LDR(H|SH|SB)?_POST")>;
473 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
474   "LDRB_POST_REG", "LDR(B?)T_POST$")>;
476 def A57WriteLdrTRegPost : SchedWriteVariant<[
477   SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
478   SchedVar<NoSchedPred,        [A57Write_4cyc_1L_1I]>
480 def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
481   SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
482   SchedVar<NoSchedPred,        [A57WrBackTwo]>
484 // 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
485 def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
486   (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
488 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
490 def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
491   SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
492   SchedVar<NoSchedPred,          [A57WrBackOne]>
494 // LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
495 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
496   A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
497 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
498   (instregex "t2LDRD_POST")>;
500 // --- Preload instructions ---
501 // Preload, immed offset
502 def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
503   "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
505 // Preload, register offset:
506 // 5cyc "I0/I1,L" for minus reg or for scaled other than plus lsl2,
507 // otherwise 4cyc "L"
508 def A57WritePLD : SchedWriteVariant<[
509   SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
510   SchedVar<IsLdstsoMinusRegPredX0,         [A57Write_5cyc_1I_1L]>,
511   SchedVar<NoSchedPred,                    [A57Write_4cyc_1L]>
513 def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
515 // --- Load multiple instructions ---
516 foreach NumAddr = 1-8 in {
517   def A57LMAddrPred#NumAddr :
518     SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
// LDM whose base register is not in the register list (selected by
// A57WriteLDM when IsLdmBaseRegInList does not hold).
// The list holds two writes per latency step, A57Write_3cyc_1L up to
// A57Write_10cyc_1L, 16 entries in total.
521 def A57LDMOpsListNoregin : A57WriteLMOpsListType<
522                 [A57Write_3cyc_1L, A57Write_3cyc_1L,
523                  A57Write_4cyc_1L, A57Write_4cyc_1L,
524                  A57Write_5cyc_1L, A57Write_5cyc_1L,
525                  A57Write_6cyc_1L, A57Write_6cyc_1L,
526                  A57Write_7cyc_1L, A57Write_7cyc_1L,
527                  A57Write_8cyc_1L, A57Write_8cyc_1L,
528                  A57Write_9cyc_1L, A57Write_9cyc_1L,
529                  A57Write_10cyc_1L, A57Write_10cyc_1L]>;
// A57LMAddrPred<N> selects the first 2*N entries of the list (slice 0..2N-1),
// so each further pair of writes is one cycle later than the previous pair.
// NoSchedPred falls back to the full 16-entry list. Marked Variadic.
530 def A57WriteLDMnoreginlist : SchedWriteVariant<[
531   SchedVar<A57LMAddrPred1,     A57LDMOpsListNoregin.Writes[0-1]>,
532   SchedVar<A57LMAddrPred2,     A57LDMOpsListNoregin.Writes[0-3]>,
533   SchedVar<A57LMAddrPred3,     A57LDMOpsListNoregin.Writes[0-5]>,
534   SchedVar<A57LMAddrPred4,     A57LDMOpsListNoregin.Writes[0-7]>,
535   SchedVar<A57LMAddrPred5,     A57LDMOpsListNoregin.Writes[0-9]>,
536   SchedVar<A57LMAddrPred6,     A57LDMOpsListNoregin.Writes[0-11]>,
537   SchedVar<A57LMAddrPred7,     A57LDMOpsListNoregin.Writes[0-13]>,
538   SchedVar<A57LMAddrPred8,     A57LDMOpsListNoregin.Writes[0-15]>,
539   SchedVar<NoSchedPred,        A57LDMOpsListNoregin.Writes[0-15]>
540 ]> { let Variadic=1; }
// LDM whose base register is in the register list (selected by A57WriteLDM
// when IsLdmBaseRegInList holds).
// Same layout as the no-reg-in-list table, but every entry is an
// A57Write_<N>cyc_1L_1I write and the steps run from 4cyc up to 11cyc.
542 def A57LDMOpsListRegin : A57WriteLMOpsListType<
543                 [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
544                  A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
545                  A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
546                  A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
547                  A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
548                  A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
549                  A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
550                  A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
// A57LMAddrPred<N> selects the first 2*N entries of the list (slice 0..2N-1);
// NoSchedPred falls back to the full 16-entry list. Marked Variadic.
551 def A57WriteLDMreginlist : SchedWriteVariant<[
552   SchedVar<A57LMAddrPred1,     A57LDMOpsListRegin.Writes[0-1]>,
553   SchedVar<A57LMAddrPred2,     A57LDMOpsListRegin.Writes[0-3]>,
554   SchedVar<A57LMAddrPred3,     A57LDMOpsListRegin.Writes[0-5]>,
555   SchedVar<A57LMAddrPred4,     A57LDMOpsListRegin.Writes[0-7]>,
556   SchedVar<A57LMAddrPred5,     A57LDMOpsListRegin.Writes[0-9]>,
557   SchedVar<A57LMAddrPred6,     A57LDMOpsListRegin.Writes[0-11]>,
558   SchedVar<A57LMAddrPred7,     A57LDMOpsListRegin.Writes[0-13]>,
559   SchedVar<A57LMAddrPred8,     A57LDMOpsListRegin.Writes[0-15]>,
560   SchedVar<NoSchedPred,        A57LDMOpsListRegin.Writes[0-15]>
561 ]> { let Variadic=1; }
// LDM forms with base-register update.
// Entry 0 is A57WrBackOne (the writeback is modeled with 1-cycle latency);
// it is followed by two A57Write_<N>cyc_1L_1I writes per latency step from
// 3cyc up to 10cyc, giving 17 entries in total.
563 def A57LDMOpsList_Upd : A57WriteLMOpsListType<
564               [A57WrBackOne,
565                A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
566                A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
567                A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
568                A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
569                A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
570                A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
571                A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
572                A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
// A57LMAddrPred<N> selects the writeback entry plus the first 2*N load
// entries (slice 0..2N); NoSchedPred falls back to the full 17-entry list.
// Marked Variadic.
573 def A57WriteLDM_Upd : SchedWriteVariant<[
574   SchedVar<A57LMAddrPred1,     A57LDMOpsList_Upd.Writes[0-2]>,
575   SchedVar<A57LMAddrPred2,     A57LDMOpsList_Upd.Writes[0-4]>,
576   SchedVar<A57LMAddrPred3,     A57LDMOpsList_Upd.Writes[0-6]>,
577   SchedVar<A57LMAddrPred4,     A57LDMOpsList_Upd.Writes[0-8]>,
578   SchedVar<A57LMAddrPred5,     A57LDMOpsList_Upd.Writes[0-10]>,
579   SchedVar<A57LMAddrPred6,     A57LDMOpsList_Upd.Writes[0-12]>,
580   SchedVar<A57LMAddrPred7,     A57LDMOpsList_Upd.Writes[0-14]>,
581   SchedVar<A57LMAddrPred8,     A57LDMOpsList_Upd.Writes[0-16]>,
582   SchedVar<NoSchedPred,        A57LDMOpsList_Upd.Writes[0-16]>
583 ]> { let Variadic=1; }
// Top-level variant for LDM forms without writeback: dispatches to the
// "base reg in list" variant when IsLdmBaseRegInList holds, otherwise to the
// "no base reg in list" variant. Each of those is itself a variadic variant
// keyed on the A57LMAddrPred<N> predicates.
585 def A57WriteLDM : SchedWriteVariant<[
586   SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
587   SchedVar<NoSchedPred,        [A57WriteLDMnoreginlist]>
588 ]> { let Variadic=1; }
// Applies to the opcodes matched by the regex below (IA/DA/DB/IB suffixes,
// with optional t, t2 or sys prefix, anchored at the end by '$').
590 def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
592 // TODO: no writeback latency defined in documentation (implemented as 1 cyc)
593 def : InstRW<[A57WriteLDM_Upd],
594   (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
596 def : InstRW<[A57Write_5cyc_1L], (instregex "VLLDM")>;
598 // --- 3.9 Store Instructions ---
600 // Store, immed offset
601 def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
602   "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
604 // Store, register offset
605 // Minus reg, or scaled other than plus lsl2, needs 3cyc "I0/I1, S",
606 // otherwise 1cyc S.
607 def A57WriteStrAmLDSTSO : SchedWriteVariant<[
608   SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
609   SchedVar<IsLdstsoMinusRegPred,         [A57Write_3cyc_1I_1S]>,
610   SchedVar<NoSchedPred,                  [A57Write_1cyc_1S]>
612 def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
614 // STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
615 def A57WriteStrAm3 : SchedWriteVariant<[
616   SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
617   SchedVar<NoSchedPred,           [A57Write_1cyc_1S]>
619 def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
620 def A57WriteStrAm3X2 : SchedWriteVariant<[
621   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
622   SchedVar<NoSchedPred,             [A57Write_1cyc_1S]>
624 def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
626 // Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
627 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM",
628   "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
629   "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
631 // Store, register pre-indexed:
632 // 1(1) "S, I0/I1" for plus reg
633 // 3(2) "I0/I1, S" for minus reg
634 // 1(2) "S, M" for scaled plus lsl2
635 // 3(2) "I0/I1, S" for other scaled
636 def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
637   SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
638   SchedVar<IsLdstsoMinusRegPredX2,         [A57Write_3cyc_1I_1S]>,
639   SchedVar<IsLdstsoScaledPredX2,           [A57Write_1cyc_1S_1M]>,
640   SchedVar<NoSchedPred,                    [A57Write_1cyc_1S_1I]>
642 def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
643   SchedVar<IsLdstsoScaledPredX2,           [A57WrBackTwo]>,
644   SchedVar<IsLdstsoMinusRegPredX2,         [A57WrBackTwo]>,
645   SchedVar<NoSchedPred,                    [A57WrBackOne]>
647 def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
648   (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
650 // pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
651 // 1(1) "S, I0/I1" for imm or reg plus
652 // 3(2) "I0/I1, S" for reg minus
653 def A57WriteStrAm3PreX2 : SchedWriteVariant<[
654   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
655   SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
657 def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
658   SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
659   SchedVar<NoSchedPred,             [A57WrBackOne]>
661 def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
662   (instregex "STRH_PRE")>;
664 def A57WriteStrAm3PreX3 : SchedWriteVariant<[
665   SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
666   SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
668 def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
669   SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
670   SchedVar<NoSchedPred,             [A57WrBackOne]>
672 def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
673   (instregex "STRD_PRE")>;
675 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
676   "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
678 // 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
679 def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
680   "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
682 // post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
683 // 1(1) "S, I0/I1" both for reg or imm
684 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
685   (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
687 // --- Store multiple instructions ---
688 // TODO: no writeback latency defined in documentation
689 def A57WriteSTM : SchedWriteVariant<[
690     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
691     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
692     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
693     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
694     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
695     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
696     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
697     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
698     SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
700 def A57WriteSTM_Upd : SchedWriteVariant<[
701     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
702     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
703     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
704     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
705     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
706     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
707     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
708     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
709     SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
712 def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
713 def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
714   (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
716 def : InstRW<[A57Write_5cyc_1S], (instregex "VLSTM")>;
718 // --- 3.10 FP Data Processing Instructions ---
719 def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>; // 32- and 64-bit generic FP ALU writes share one 5cyc write
720 def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
722 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>; // VABSS/VABSD/VABSH override the alias with 3cyc
724 // FP compare: 3cyc F1 when unconditional, 6cyc "F0/F1, F1" when predicated
725 def A57WriteVcmp : SchedWriteVariant<[
726   SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
727   SchedVar<NoSchedPred,      [A57Write_3cyc_1X]> // fallback: not predicated
728 ]>;
729 def : InstRW<[A57WriteVcmp],
730   (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>;
732 // FP convert: every form listed here uses A57Write_5cyc_1V
733 def : InstRW<[A57Write_5cyc_1V], (instregex
734   "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
735 def : InstRW<[A57Write_5cyc_1V], (instregex "VTOSLS", "VTOUHS", "VTOULS")>;
736 def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
738 def : InstRW<[A57Write_5cyc_1V], (instregex "VJCVT")>;
740 // FP round to integral
741 def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
743 // FP divide, FP square root: 17cyc for the 32-bit writes, 32cyc for the 64-bit writes
744 def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>;
745 def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
746 def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
747 def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
749 def : InstRW<[A57Write_17cyc_1W], (instregex "VSQRTH")>; // VSQRTH reuses the 17cyc write of the 32-bit case
751 // FP max/min
752 def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
754 // FP multiply-accumulate pipelines support late forwarding of the result
755 // from FP multiply μops to the accumulate operands of an
756 // FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
757 // after the FP multiply μop has been issued.
758 // FP multiply, FZ
759 def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
761 def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
762 def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
763 def : ReadAdvance<ReadFPMUL, 0>;
765 // FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
766 // VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
767 def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
769 // VFMA takes 9 cyc for common case and 4 cyc for VFMA->VFMA chain (5 read adv.)
770 // VMUL takes 5 cyc for common case and 1 cyc for VMUL->VFMA chain (4 read adv.)
771 // Currently, there is no way to define different read advances for a VFMA
772 // operand coming from VFMA or from VMUL, so a read advance of 5 is used for both.
773 // Zero latency (instead of one) for VMUL->VFMA shouldn't break anything.
774 // The same situation applies to the ASIMD VMUL/VFMA instructions.
775 // def A57ReadVFMA : SchedRead;
776 // def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
777 // def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
778 def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
780 def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
781 def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
782 def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
784 // VMLAH/VMLSH are not bound to scheduling classes by default, so bind them here:
785 def : InstRW<[A57WriteVFMA, A57ReadVFMA5, ReadFPMUL, ReadFPMUL],
786   (instregex "VMLAH", "VMLSH", "VNMLAH", "VNMLSH")>;
788 def : InstRW<[A57WriteVMUL],
789   (instregex "VUDOTD", "VSDOTD", "VUDOTQ", "VSDOTQ")>; // dot-product forms reuse the 5cyc FP multiply write
791 def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
792 def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
794 // --- 3.11 FP Miscellaneous Instructions ---
795 // VMOV: 3cyc "F0/F1" for imm/reg
796 def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
797 def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
799 def : InstRW<[A57Write_3cyc_1V], (instregex "VINSH")>;
801 // 5cyc L for FP transfer, vfp to core reg,
802 // 5cyc L for FP transfer, core reg to vfp
803 def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
804 // VMOVRRS/VMOVRRD are declared in common code with one WriteFPMOV (instead of 2),
804 // so two writes are listed explicitly here.
805 def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>;
807 // 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
808 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VMOVDRR")>; // 1V = F0/F1, matching the other 8cyc "L, F0/F1" entries (VDUP, VSETLN)
810 // --- 3.12 FP Load Instructions ---
811 def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>;
813 def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>;
815 // FP load multiple (VLDM): the unconditional list adds 1cyc per pair of writes (5..12cyc),
815 // the conditional list adds 1cyc per write (5..20cyc).
817 def A57VLDMOpsListUncond : A57WriteLMOpsListType<
818                [A57Write_5cyc_1L, A57Write_5cyc_1L,
819                 A57Write_6cyc_1L, A57Write_6cyc_1L,
820                 A57Write_7cyc_1L, A57Write_7cyc_1L,
821                 A57Write_8cyc_1L, A57Write_8cyc_1L,
822                 A57Write_9cyc_1L, A57Write_9cyc_1L,
823                 A57Write_10cyc_1L, A57Write_10cyc_1L,
824                 A57Write_11cyc_1L, A57Write_11cyc_1L,
825                 A57Write_12cyc_1L, A57Write_12cyc_1L]>;
826 def A57WriteVLDMuncond : SchedWriteVariant<[ // A57LMAddrPredN selects the first 2*N writes of the list
827   SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond.Writes[0-1]>,
828   SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond.Writes[0-3]>,
829   SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond.Writes[0-5]>,
830   SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond.Writes[0-7]>,
831   SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond.Writes[0-9]>,
832   SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond.Writes[0-11]>,
833   SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond.Writes[0-13]>,
834   SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond.Writes[0-15]>,
835   SchedVar<NoSchedPred,     A57VLDMOpsListUncond.Writes[0-15]> // fallback: the whole list
836 ]> { let Variadic=1; }
838 def A57VLDMOpsListCond : A57WriteLMOpsListType<
839                [A57Write_5cyc_1L, A57Write_6cyc_1L,
840                 A57Write_7cyc_1L, A57Write_8cyc_1L,
841                 A57Write_9cyc_1L, A57Write_10cyc_1L,
842                 A57Write_11cyc_1L, A57Write_12cyc_1L,
843                 A57Write_13cyc_1L, A57Write_14cyc_1L,
844                 A57Write_15cyc_1L, A57Write_16cyc_1L,
845                 A57Write_17cyc_1L, A57Write_18cyc_1L,
846                 A57Write_19cyc_1L, A57Write_20cyc_1L]>;
847 def A57WriteVLDMcond : SchedWriteVariant<[ // same slicing as the unconditional table
848   SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond.Writes[0-1]>,
849   SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond.Writes[0-3]>,
850   SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond.Writes[0-5]>,
851   SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond.Writes[0-7]>,
852   SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond.Writes[0-9]>,
853   SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond.Writes[0-11]>,
854   SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond.Writes[0-13]>,
855   SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond.Writes[0-15]>,
856   SchedVar<NoSchedPred,     A57VLDMOpsListCond.Writes[0-15]> // fallback: the whole list
857 ]> { let Variadic=1; }
859 def A57WriteVLDM : SchedWriteVariant<[ // predicated instructions use the conditional table
860   SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
861   SchedVar<NoSchedPred,      [A57WriteVLDMuncond]>
862 ]> { let Variadic=1; }
864 def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>;
866 def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType< // unconditional list for the _UPD forms: 5..12cyc, 1cyc per pair
867                [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
868                 A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
869                 A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
870                 A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
871                 A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
872                 A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
873                 A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
874                 A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>;
875 def A57WriteVLDMuncond_UPD : SchedWriteVariant<[ // A57LMAddrPredN selects the first 2*N writes of the list
876   SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond_Upd.Writes[0-1]>,
877   SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond_Upd.Writes[0-3]>,
878   SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond_Upd.Writes[0-5]>,
879   SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond_Upd.Writes[0-7]>,
880   SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond_Upd.Writes[0-9]>,
881   SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond_Upd.Writes[0-11]>,
882   SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond_Upd.Writes[0-13]>,
883   SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond_Upd.Writes[0-15]>,
884   SchedVar<NoSchedPred,     A57VLDMOpsListUncond_Upd.Writes[0-15]> // fallback: the whole list
885 ]> { let Variadic=1; }
887 def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType< // conditional list for the _UPD forms: 5..20cyc, 1cyc per write
888                [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I,
889                 A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I,
890                 A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I,
891                 A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
892                 A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
893                 A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
894                 A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
895                 A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>;
896 def A57WriteVLDMcond_UPD : SchedWriteVariant<[ // same slicing as the unconditional _UPD table
897   SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond_Upd.Writes[0-1]>,
898   SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond_Upd.Writes[0-3]>,
899   SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond_Upd.Writes[0-5]>,
900   SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond_Upd.Writes[0-7]>,
901   SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond_Upd.Writes[0-9]>,
902   SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond_Upd.Writes[0-11]>,
903   SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond_Upd.Writes[0-13]>,
904   SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond_Upd.Writes[0-15]>,
905   SchedVar<NoSchedPred,     A57VLDMOpsListCond_Upd.Writes[0-15]> // fallback: the whole list
906 ]> { let Variadic=1; }
908 def A57WriteVLDM_UPD : SchedWriteVariant<[ // predicated instructions use the conditional table
909   SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
910   SchedVar<NoSchedPred,      [A57WriteVLDMuncond_UPD]>
911 ]> { let Variadic=1; }
913 def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD], // write-back write listed first
914   (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
916 // --- 3.13 FP Store Instructions ---
917 def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>; // single FP store: 1cyc on S
919 def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>; // VSTMQIA: fixed 2cyc on S
921 def A57WriteVSTMs : SchedWriteVariant<[ // used for VSTMSIA: latency steps 1..8cyc with A57LMAddrPred1..8
922     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
923     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
924     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
925     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
926     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
927     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
928     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
929     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
930     SchedVar<NoSchedPred,    [A57Write_2cyc_1S]> // fallback when no A57LMAddrPredN matched
931 ]>;
932 def A57WriteVSTMd : SchedWriteVariant<[ // used for VSTMDIA: latency steps 2..16cyc, 2cyc per A57LMAddrPred step
933     SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
934     SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
935     SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
936     SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
937     SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
938     SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
939     SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
940     SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
941     SchedVar<NoSchedPred,    [A57Write_4cyc_1S]> // fallback when no A57LMAddrPredN matched
942 ]>;
943 def A57WriteVSTMs_Upd : SchedWriteVariant<[ // same 1..8cyc steps as A57WriteVSTMs, using the _1S_1I writes
944     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
945     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
946     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
947     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
948     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
949     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
950     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
951     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
952     SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]> // fallback when no A57LMAddrPredN matched
953 ]>;
954 def A57WriteVSTMd_Upd : SchedWriteVariant<[ // same 2..16cyc steps as A57WriteVSTMd, using the _1S_1I writes
955     SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
956     SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
957     SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
958     SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
959     SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
960     SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
961     SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
962     SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
963     SchedVar<NoSchedPred,    [A57Write_4cyc_1S_1I]> // fallback: 4cyc, the same default as A57WriteVSTMd (was 2cyc, the S-register value)
964 ]>;
966 def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>; // S-register list, no base update
967 def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>; // D-register list, no base update
968 def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd], // _UPD forms: write-back write listed first
969   (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
970 def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
971   (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
973 // --- 3.14 ASIMD Integer Instructions ---
975 // ASIMD absolute diff, 3cyc F0/F1 for integer VABD
976 def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>;
978 // ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
978 // (the figure in parentheses is Latency minus the ReadAdvance of 3)
979 def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
980 def A57ReadVABAD  : SchedReadAdvance<3, [A57WriteVABAD]>; // advance applies only after an A57WriteVABAD producer
981 def : InstRW<[A57WriteVABAD, A57ReadVABAD],
982   (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;
983 def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
984 def A57ReadVABAQ  : SchedReadAdvance<3, [A57WriteVABAQ]>; // advance applies only after an A57WriteVABAQ producer
985 def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
986   (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;
988 // ASIMD absolute diff accum long: 4(1) F1 for VABAL
989 def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
990 def A57ReadVABAL  : SchedReadAdvance<3, [A57WriteVABAL]>;
991 def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>;
993 // ASIMD absolute diff long: 3cyc F0/F1 for VABDL
994 def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>;
996 // ASIMD arith, basic: 3cyc on V
997 def : InstRW<[A57Write_3cyc_1V], (instregex "VADDv", "VADDL", "VADDW",
998   "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
999   "VPADDi", "VPADDL", "VSUBv", "VSUBL", "VSUBW")>;
1001 // ASIMD arith, complex: 3cyc on V
1002 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
1003   "VQABS", "VQADD", "VQNEG", "VQSUB",
1004   "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
1006 // ASIMD compare
1007 def : InstRW<[A57Write_3cyc_1V],
1008   (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;
1010 // ASIMD logical
1011 def : InstRW<[A57Write_3cyc_1V],
1012   (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;
1014 // ASIMD max/min (integer forms)
1015 def : InstRW<[A57Write_3cyc_1V],
1016   (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
1018 // ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1019 // Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
1020 // and multiply-with-accumulate instructions relative to r0pX.
1020 // NOTE: IsR1P0AndLaterPred is defined near the top of this file as a constant
1020 // false predicate, so the NoSchedPred (r0px) variant is the one selected.
1021 def A57WriteVMULD_VecInt : SchedWriteVariant<[
1022   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1023   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1024 def : InstRW<[A57WriteVMULD_VecInt], (instregex
1025   "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)",
1026   "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>;
1028 // ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
1029 def A57WriteVMULQ_VecInt : SchedWriteVariant<[
1030   SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1031   SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1032 def : InstRW<[A57WriteVMULQ_VecInt], (instregex
1033   "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)",
1034   "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>;
1036 // ASIMD multiply accumulate, D-form
1037 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1038 // (4 or 3 ReadAdvance)
1039 def A57WriteVMLAD_VecInt : SchedWriteVariant<[
1040   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1041   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1042 def A57ReadVMLAD_VecInt : SchedReadVariant<[ // advance = latency - 1 in both variants (4-3, 5-4)
1043   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
1044   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
1045 ]>;
1046 def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
1047   (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
1049 // ASIMD multiply accumulate, Q-form
1050 // 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1051 // (4 or 3 ReadAdvance)
1052 def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
1053   SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1054   SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1055 def A57ReadVMLAQ_VecInt : SchedReadVariant<[ // advance = latency - 2 in both variants (5-3, 6-4)
1056   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
1057   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
1058 ]>;
1059 def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
1060   (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
1062 // ASIMD multiply accumulate long
1063 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1064 // (4 or 3 ReadAdvance)
1065 def A57WriteVMLAL_VecInt : SchedWriteVariant<[
1066   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1067   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1068 def A57ReadVMLAL_VecInt : SchedReadVariant<[ // advance = latency - 1 in both variants (4-3, 5-4)
1069   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
1070   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
1071 ]>;
1072 def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
1073   (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
1075 // ASIMD multiply accumulate saturating long
1076 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1077 // (3 or 2 ReadAdvance)
1078 def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
1079   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1080   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1081 def A57ReadVQDMLAL_VecInt : SchedReadVariant<[ // advance = latency - 2 in both variants (4-2, 5-3)
1082   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
1083   SchedVar<NoSchedPred,        [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
1084 ]>;
1085 def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
1086   (instregex "VQDMLAL", "VQDMLSL")>;
1088 // Vector Saturating Rounding Doubling Multiply Accumulate/Subtract Long
1089 // Scheduling info reused from VQDMLAL/VQDMLSL
1090 def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
1091   (instregex "VQRDMLAH", "VQRDMLSH")>;
1093 // ASIMD multiply long
1094 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1095 def A57WriteVMULL_VecInt : SchedWriteVariant<[
1096   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1097   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1098 def : InstRW<[A57WriteVMULL_VecInt],
1099   (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
1101 // ASIMD pairwise add and accumulate
1102 // 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1103 def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
1104 def A57ReadVPADAL  : SchedReadAdvance<3, [A57WriteVPADAL]>;
1105 def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>;
1107 // ASIMD shift accumulate
1108 // 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1109 def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
1110 def A57ReadVSRA  : SchedReadAdvance<3, [A57WriteVSRA]>;
1111 def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>;
1113 // ASIMD shift by immed, basic: 3cyc on X
1114 def : InstRW<[A57Write_3cyc_1X],
1115   (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;
1117 // ASIMD shift by immed, complex: 4cyc on X
1118 def : InstRW<[A57Write_4cyc_1X], (instregex
1119   "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)",
1120   "VRSHRN")>;
1122 // ASIMD shift by immed and insert, basic, D-form: 4cyc on X
1123 def : InstRW<[A57Write_4cyc_1X], (instregex
1124   "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>;
1126 // ASIMD shift by immed and insert, basic, Q-form: 5cyc on X
1127 def : InstRW<[A57Write_5cyc_1X], (instregex
1128   "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
1130 // ASIMD shift by register, basic, D-form: 3cyc on X
1131 def : InstRW<[A57Write_3cyc_1X], (instregex
1132   "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1134 // ASIMD shift by register, basic, Q-form: 4cyc on X
1135 def : InstRW<[A57Write_4cyc_1X], (instregex
1136   "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1138 // ASIMD shift by register, complex, D-form: 4cyc on X
1139 // VQRSHL, VQSHL, VRSHL
1140 def : InstRW<[A57Write_4cyc_1X], (instregex
1141   "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
1142   "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1144 // ASIMD shift by register, complex, Q-form: 5cyc on X
1145 def : InstRW<[A57Write_5cyc_1X], (instregex
1146   "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
1147   "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1149 // --- 3.15 ASIMD Floating-Point Instructions ---
1150 // ASIMD FP absolute value
1151 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
1153 // ASIMD FP arith: 5cyc on V
1154 def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
1155   "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
1157 def : InstRW<[A57Write_5cyc_1V], (instregex "VCADD", "VCMLA")>; // VCADD/VCMLA use the same write as FP arith
1159 // ASIMD FP compare
1160 def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
1161   "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
1163 // ASIMD FP convert, integer
1164 def : InstRW<[A57Write_5cyc_1V], (instregex
1165   "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
1166   "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
1167   "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;
1169 // ASIMD FP convert, half-precision: 8cyc F0/F1
1170 def : InstRW<[A57Write_8cyc_1V], (instregex
1171   "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
1172   "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
1173   "VCVT(f2h|h2f)")>;
1175 // ASIMD FP max/min
1176 def : InstRW<[A57Write_5cyc_1V], (instregex
1177   "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
1179 // ASIMD FP multiply: 5cyc on V
1180 def A57WriteVMUL_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
1181 def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;
1183 // ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
1184 def A57WriteVMLA_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
1185 def A57ReadVMLA_VecFP  : // one advance of 5 for both VMLA and VMUL producers, as in the scalar VFMA case
1186   SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
1187 def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
1188   (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>;
1190 // ASIMD FP negate
1191 def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>;
1193 // ASIMD FP round to integral
1194 def : InstRW<[A57Write_5cyc_1V], (instregex
1195   "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
1197 // --- 3.16 ASIMD Miscellaneous Instructions ---
1199 // ASIMD bitwise insert
1200 def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
1202 // ASIMD count
1203 def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
1205 // ASIMD duplicate, core reg: 8cyc "L, F0/F1"
1206 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
1208 // ASIMD duplicate, scalar: 3cyc "F0/F1"
1209 def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
1211 // ASIMD extract
1212 def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
1214 // ASIMD move, immed
1215 def : InstRW<[A57Write_3cyc_1V], (instregex
1216   "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
1217   "VMOVD0", "VMOVQ0")>;
1219 // ASIMD move, narrowing
1220 def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
1222 // ASIMD move, saturating: 4cyc on X
1223 def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
1225 // ASIMD reciprocal estimate
1226 def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
1228 // ASIMD reciprocal step, FZ
1229 def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
1231 // ASIMD reverse, swap, table lookup (1-2 reg)
1232 def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
1234 // ASIMD table lookup (3-4 reg)
1235 def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
1237 // ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
1238 def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
1240 // ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
1241 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
1243 // ASIMD transpose (one write listed per result)
1244 def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
1246 // ASIMD unzip/zip, D-form
1247 def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
1248   (instregex "VUZPd", "VZIPd")>;
1250 // ASIMD unzip/zip, Q-form
1251 def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
1252   (instregex "VUZPq", "VZIPq")>;
1254 // --- 3.17 ASIMD Load Instructions ---
1256 // Overridden via InstRW for this processor, so the generic writes carry no resources.
1257 def : WriteRes<WriteVLD1, []>;
1258 def : WriteRes<WriteVLD2, []>;
1259 def : WriteRes<WriteVLD3, []>;
1260 def : WriteRes<WriteVLD4, []>;
1261 def : WriteRes<WriteVST1, []>;
1262 def : WriteRes<WriteVST2, []>;
1263 def : WriteRes<WriteVST3, []>;
1264 def : WriteRes<WriteVST4, []>;
1266 // 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
1267 def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>;
1268 def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne], // loads list the data write first, then the write-back
1269   (instregex "VLD1(d|q)(8|16|32|64)wb")>;
1271 // 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
1272 def : InstRW<[A57Write_6cyc_1L],
1273   (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>;
1275 def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
1276   (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
1278 // ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
1279 def : InstRW<[A57Write_8cyc_1L_1V], (instregex
1280   "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
1281 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex
1282   "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>;
1284 // ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1"
1284 // NOTE(review): the store section groups VST2(d|b) as "2 reg" and VST2q as "4 reg",
1284 // the opposite of the q/b grouping used for these loads - confirm which is intended.
1285 def : InstRW<[A57Write_8cyc_1L_1V],
1286       (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
1287 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1288       (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
1290 // ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1"
1291 def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>;
1292 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1293       (instregex "VLD2b(8|16|32)wb")>;
1295 // ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
1296 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1297       (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
1298                  "VLD2LN(d|q)(8|16|32)Pseudo$")>;
1299 // 2 results + wb result
1299 // NOTE(review): only the first write carries _1I here, whereas the VLD3/VLD4 _UPD entries put _1I on every result write - confirm.
1300 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
1301       (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
1302 // 1 result + wb result
1303 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1304       (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
1305                  "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
1307 // ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
1308 // 3 results (one write per result)
1309 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1310       (instregex "VLD3(d|q)(8|16|32)$")>;
1311 // 1 result (Pseudo forms)
1312 def : InstRW<[A57Write_9cyc_1L_1V],
1313       (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
1314 // 3 results + wb
1315 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1316               A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1317       (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
1318 // 1 result + wb (Pseudo forms)
1319 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1320       (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1322 // ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
1323 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1324       (instregex "VLD3LN(d|q)32$",
1325                  "VLD3LN(d|q)32Pseudo$")>;
1326 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1327               A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1328       (instregex "VLD3LN(d|q)32_UPD")>;
1329 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1330       (instregex "VLD3LN(d|q)32Pseudo_UPD")>;
1332 // ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
1333 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1334       (instregex "VLD3LN(d|q)(8|16)$",
1335                  "VLD3LN(d|q)(8|16)Pseudo$")>;
1336 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1337               A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1338       (instregex "VLD3LN(d|q)(8|16)_UPD")>;
1339 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1340       (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;
1342 // ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
1343 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1344       (instregex "VLD3DUP(d|q)(8|16|32)$",
1345                  "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
1346 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1347               A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1348       (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
1349 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1350       (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
1352 // ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
1352 // (4 result writes; Pseudo forms list 1; _UPD forms append the write-back write)
1353 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1354               A57Write_9cyc_1L_1V],
1355       (instregex "VLD4(d|q)(8|16|32)$")>;
1356 def : InstRW<[A57Write_9cyc_1L_1V],
1357       (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
1358 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1359               A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1360       (instregex "VLD4(d|q)(8|16|32)_UPD")>;
1361 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1362       (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1364 // ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
1365 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1366               A57Write_8cyc_1L_1V],
1367       (instregex "VLD4LN(d|q)32$",
1368                  "VLD4LN(d|q)32Pseudo$")>;
1369 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1370               A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1371               A57WrBackOne],
1372       (instregex "VLD4LN(d|q)32_UPD")>;
1373 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1374       (instregex "VLD4LN(d|q)32Pseudo_UPD")>;
1376 // ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
1377 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1378               A57Write_9cyc_1L_1V],
1379       (instregex "VLD4LN(d|q)(8|16)$",
1380                  "VLD4LN(d|q)(8|16)Pseudo$")>;
1381 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1382               A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1383               A57WrBackOne],
1384       (instregex "VLD4LN(d|q)(8|16)_UPD")>;
1385 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1386       (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;
1388 // ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
1389 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1390               A57Write_8cyc_1L_1V],
1391       (instregex "VLD4DUP(d|q)(8|16|32)$",
1392                  "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
1393 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1394               A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1395               A57WrBackOne],
1396       (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
1397 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1398       (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
1400 // --- 3.18 ASIMD Store Instructions ---
1402 // ASIMD store, 1 element, multiple, 1 reg: 1cyc S
1403 def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
1404 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
1405       (instregex "VST1d(8|16|32|64)wb")>;
1406 // ASIMD store, 1 element, multiple, 2 reg: 2cyc S
// Plain opcodes are end-anchored with `$`; the write-back opcodes are matched
// by the un-anchored "wb" pattern and put A57WrBackOne first.
1407 def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
1408 def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
1409       (instregex "VST1q(8|16|32|64)wb")>;
1410 // ASIMD store, 1 element, multiple, 3 reg: 3cyc S
// Matches the "T" opcodes for all four element sizes; only the 64-bit size
// has a Pseudo pattern listed here (VST1d64TPseudo / VST1d64TPseudoWB).
1411 def : InstRW<[A57Write_3cyc_1S],
1412       (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
1413 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
1414       (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
1415 // ASIMD store, 1 element, multiple, 4 reg: 4cyc S
// The "Q" and "QPseudo" opcodes share one pattern per entry; unlike the
// 3 reg entry, the Pseudo alternative is accepted for every element size.
1416 def : InstRW<[A57Write_4cyc_1S],
1417       (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
1418 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
1419       (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
1420 // ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
// The patterns pair the "d" opcodes with the "q" Pseudo opcodes; no "d"
// Pseudo or plain "q" name is listed for VST1LN.
1421 def : InstRW<[A57Write_3cyc_1S_1V],
1422       (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
1423 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1424       (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
1425 // ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
// Covers both the "d" and "b" opcode spellings; the alternation is written
// (d|b) in the first pattern and (b|d) in the second, which is equivalent.
1426 def : InstRW<[A57Write_3cyc_1S_1V],
1427       (instregex "VST2(d|b)(8|16|32)$")>;
1428 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1429       (instregex "VST2(b|d)(8|16|32)wb")>;
1430 // ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
// The "q" opcodes and their Pseudo forms share each entry; write-back names
// use the "wb" / "PseudoWB" suffixes here rather than "_UPD".
1431 def : InstRW<[A57Write_4cyc_1S_1V],
1432       (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
1433 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1434       (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
1435 // ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
// Both "d" and "q" opcodes, plain and Pseudo, are covered; the write-back
// names use the "_UPD" suffix.
1436 def : InstRW<[A57Write_3cyc_1S_1V],
1437       (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
1438 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1439       (instregex "VST2LN(d|q)(8|16|32)_UPD",
1440                  "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
1441 // ASIMD store, 3 element, multiple, 3 reg
// No latency is quoted for this entry; it uses A57Write_3cyc_1S_1V, the same
// write as the 3cyc "F0/F1, S" stores in this section, for every element size.
// "(oddP|P)seudo" accepts both the "Pseudo" and "oddPseudo" opcode names.
1442 def : InstRW<[A57Write_3cyc_1S_1V],
1443       (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
1444 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1445       (instregex "VST3(d|q)(8|16|32)_UPD",
1446                  "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1447 // ASIMD store, 3 element, one lane
// No latency is quoted for this entry; it uses A57Write_3cyc_1S_1V, the same
// write as the other one-lane stores in this section.
1448 def : InstRW<[A57Write_3cyc_1S_1V],
1449       (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
1450 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1451       (instregex "VST3LN(d|q)(8|16|32)_UPD",
1452                  "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
1453 // ASIMD store, 4 element, multiple, 4 reg
// No latency is quoted for this entry; it uses A57Write_4cyc_1S_1V, the same
// write as the 4cyc "F0/F1, S" VST2 4 reg stores, for every element size.
// "(oddP|P)seudo" accepts both the "Pseudo" and "oddPseudo" opcode names.
1454 def : InstRW<[A57Write_4cyc_1S_1V],
1455       (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
1456 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1457       (instregex "VST4(d|q)(8|16|32)_UPD",
1458                  "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1459 // ASIMD store, 4 element, one lane
// No latency is quoted for this entry; it uses A57Write_3cyc_1S_1V (not the
// 4cyc write of the multi-register VST4 entry), matching the other one-lane
// stores in this section.
1460 def : InstRW<[A57Write_3cyc_1S_1V],
1461       (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
1462 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1463       (instregex "VST4LN(d|q)(8|16|32)_UPD",
1464                  "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
1466 // --- 3.19 Cryptography Extensions ---
// In this section every entry commented "F0" uses a *_W write and the one
// entry commented "F0/F1" uses a *_V write; the 3cyc entries use the 1W form
// and the 6cyc entries the 2W/2V form.
// NOTE(review): the W/V resource definitions are outside this part of the
// file -- confirm the pipe mapping there.
1467 // Crypto AES ops
1468 // AESD, AESE, AESIMC, AESMC: 3cyc F0
1469 def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
1470 // Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
1471 def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
1472 // Crypto SHA1 xor ops: 6cyc F0/F1
1473 def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
1474 // Crypto SHA1 fast ops: 3cyc F0
1475 def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
1476 // Crypto SHA1 slow ops: 6cyc F0
1477 def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
1478 // Crypto SHA256 fast ops: 3cyc F0
1479 def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
1480 // Crypto SHA256 slow ops: 6cyc F0
1481 def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
1483 // --- 3.20 CRC ---
// CRC32 opcodes, with or without a "t2" name prefix, use A57Write_3cyc_1W,
// the same write as the 3cyc crypto entries.
1484 def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
1486 // -----------------------------------------------------------------------------
1487 // Common definitions
// WriteNoop is given an empty resource list, zero latency and zero micro-ops.
1488 def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
// The SchedAlias entries below map the generic Write* types onto A57 writes.
1489 def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
// Branches: WriteBr uses the B-only write; WriteBrL and WriteBrTbl both use
// the B+I write.
1491 def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
1492 def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
1493 def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
// WritePreLd and WriteLd share the same write, A57Write_4cyc_1L.
1494 def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;
1496 def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
1497 def : SchedAlias<WriteST, A57Write_1cyc_1S>;
// ReadALU is given a read advance of 0 cycles.
1498 def : ReadAdvance<ReadALU, 0>;
1500 } // SchedModel = CortexA57Model