[llvm-shlib] Fix the version naming style of libLLVM for Windows (#85710)
[llvm-project.git] / llvm / lib / Target / AArch64 / AArch64SchedA510.td
blob1b66d6bb8fbd443cc5dc0fabd1efcf4010e8c21c
1 //==- AArch64SchedA510.td - ARM Cortex-A510 Scheduling Definitions -*- tablegen -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the machine model for the ARM Cortex-A510 processor.
11 //===----------------------------------------------------------------------===//
13 // ===---------------------------------------------------------------------===//
14 // The following definitions describe the per-operand machine model.
15 // This works with MachineScheduler. See MCSchedModel.h for details.
17 // Cortex-A510 machine model for scheduling and other instruction cost heuristics.
18 def CortexA510Model : SchedMachineModel {
19   let MicroOpBufferSize = 0;  // The Cortex-A510 is an in-order processor
20   let IssueWidth = 3;         // Models issue of up to 3 micro-ops per cycle
21   let LoadLatency = 3;        // Cycles for loads to access the cache.
22                               // Most loads have a latency of 2, but some have higher latencies.
23                               // 3 seems to be a good tradeoff
24   let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
25   let CompleteModel = 0;      // Covers instructions applicable to Cortex-A510.
26   // FIXME: Remove when all errors have been fixed.
27   let FullInstRWOverlapCheck = 0;
32 //===----------------------------------------------------------------------===//
33 // Subtarget-specific SchedWrite types
35 let SchedModel = CortexA510Model in {
37 //===----------------------------------------------------------------------===//
38 // Define each kind of processor resource and number available.
40 // Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
41 // Cortex-A510 is in-order.
42 let BufferSize = 0 in {
43   def CortexA510UnitALU0   : ProcResource<1>;    // Int ALU0
44   def CortexA510UnitALU12  : ProcResource<2>;    // Int ALU1 & ALU2
45   def CortexA510UnitMAC    : ProcResource<1>;    // Int MAC, 64-bit wide
46   def CortexA510UnitDiv    : ProcResource<1>;    // Int Division, not pipelined
47   // There are 2 LS pipes, 1 for Load/Store; 1 for Load only
48   def CortexA510UnitLdSt   : ProcResource<1>;    // Load/Store shared pipe
49   def CortexA510UnitLd1    : ProcResource<1>;    // Load pipe
50   def CortexA510UnitB      : ProcResource<1>;    // Branch
51   def CortexA510UnitPAC    : ProcResource<1>;    // Pointer Authentication (PAC) pipe
53   // The FP DIV/SQRT instructions execute totally differently from the FP ALU
54   // instructions, which can mostly be dual-issued; that's why for now we model
55   // them with 2 resources.
56   def CortexA510UnitVALU0  : ProcResource<1>;    // SIMD/FP/SVE ALU0
57   def CortexA510UnitVALU1  : ProcResource<1>;    // SIMD/FP/SVE ALU1
58   def CortexA510UnitVMAC   : ProcResource<2>;    // SIMD/FP/SVE MAC
59   def CortexA510UnitVMC    : ProcResource<1>;    // SIMD/FP/SVE multicycle instrs  (e.g Div, SQRT, cryptography)
62 def CortexA510UnitLd     : ProcResGroup<[CortexA510UnitLdSt, CortexA510UnitLd1]>;
63 def CortexA510UnitVALU   : ProcResGroup<[CortexA510UnitVALU0, CortexA510UnitVALU1]>;
64 def CortexA510UnitALU    : ProcResGroup<[CortexA510UnitALU0, CortexA510UnitALU12]>;
65 // These latencies are modeled without taking into account forwarding paths
66 // (the software optimisation guide lists latencies taking into account
67 // typical forwarding paths).
68 def : WriteRes<WriteImm, [CortexA510UnitALU]> { let Latency = 1; }    // MOVN, MOVZ
69 def : WriteRes<WriteI, [CortexA510UnitALU]> { let Latency = 1; }      // ALU
70 def : WriteRes<WriteISReg, [CortexA510UnitALU]> { let Latency = 2; }  // ALU of Shifted-Reg
71 def : WriteRes<WriteIEReg, [CortexA510UnitALU]> { let Latency = 2; }  // ALU of Extended-Reg
72 def : WriteRes<WriteExtr, [CortexA510UnitALU]> { let Latency = 2; }   // EXTR from a reg pair
73 def : WriteRes<WriteIS, [CortexA510UnitALU]> { let Latency = 2; }     // Shift/Scale
75 // MAC
76 def : WriteRes<WriteIM32, [CortexA510UnitMAC]> { let Latency = 3; }   // 32-bit Multiply
77 def : WriteRes<WriteIM64, [CortexA510UnitMAC]> { let Latency = 5; let ReleaseAtCycles = [2];}   // 64-bit Multiply
79 // Div
80 def : WriteRes<WriteID32, [CortexA510UnitDiv]> {
81   let Latency = 8; let ReleaseAtCycles = [8];
83 def : WriteRes<WriteID64, [CortexA510UnitDiv]> {
84   let Latency = 16; let ReleaseAtCycles = [16];
87 //===----------------------------------------------------------------------===//
88 // Define customized scheduler read/write types specific to the Cortex A510
90 //===----------------------------------------------------------------------===//
91 class CortexA510Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {  // single-resource write: n = latency, res = unit used
92   let Latency = n;
95 class CortexA510MCWrite<int n, int m, ProcResourceKind res> : SchedWriteRes<[res]> {  // multicycle write: n = latency, m = ReleaseAtCycles entry for res
96   let Latency = n;
97   let ReleaseAtCycles = [m];
98   let BeginGroup = 1;  // NOTE(review): presumably forces the instruction to start an issue group - confirm against TargetSchedule.td
101 class CortexA510MC_RC0Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {  // as CortexA510MCWrite but ReleaseAtCycles is left at its default
102   let Latency = n;
103   let BeginGroup = 1;
106 //===----------------------------------------------------------------------===//
107 // Define generic 2 micro-op types
108 def A510Write_10cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
109   let Latency     = 10;
110   let NumMicroOps = 2;
113 def A510Write_15cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
114   let Latency     = 15;
115   let NumMicroOps = 2;
118 class A510Write_PAC_B <int lat> : SchedWriteRes<[CortexA510UnitPAC, CortexA510UnitB]> {
119   let Latency = lat;
120   let NumMicroOps = 2;
122 // Load
123 def : WriteRes<WriteLD, [CortexA510UnitLd]> { let Latency = 2; }
124 def : WriteRes<WriteLDIdx, [CortexA510UnitLd]> { let Latency = 2; }
125 def : WriteRes<WriteLDHi, [CortexA510UnitLd]> { let Latency = 2; }
127 def CortexA510WriteVLD1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
128 def CortexA510WriteVLD1SI : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; let SingleIssue = 1; }
129 def CortexA510WriteVLD2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 4;
130                                                   let ReleaseAtCycles = [2]; }
131 def CortexA510WriteVLD3 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5;
132                                                   let ReleaseAtCycles = [3]; }
133 def CortexA510WriteVLD4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6;
134                                                   let ReleaseAtCycles = [4]; }
135 def CortexA510WriteVLD6 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5;
136                                                   let ReleaseAtCycles = [3]; }
137 def CortexA510WriteVLD8 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6;
138                                                   let ReleaseAtCycles = [4]; }
140 def CortexA510WriteLDP1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
141 def CortexA510WriteLDP2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
142 def CortexA510WriteLDP4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
144 // Pre/Post Indexing - Performed as part of address generation
145 def : WriteRes<WriteAdr, []> { let Latency = 0; }
147 // Store
148 let RetireOOO = 1 in {
149 def : WriteRes<WriteST, [CortexA510UnitLdSt]> { let Latency = 1; }
150 def : WriteRes<WriteSTP, [CortexA510UnitLdSt]> { let Latency = 1; }
151 def : WriteRes<WriteSTIdx, [CortexA510UnitLdSt]> { let Latency = 1; }
153 def : WriteRes<WriteSTX, [CortexA510UnitLdSt]> { let Latency = 3; }
155 // Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
156 def : WriteRes<WriteVST, [CortexA510UnitLdSt]> { let Latency = 5;
157                                           let ReleaseAtCycles = [2];}
158 def CortexA510WriteVST1 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 4; }
159 def CortexA510WriteVST2 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
160                                                   let ReleaseAtCycles = [2]; }
161 def CortexA510WriteVST3 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
162                                                   let ReleaseAtCycles = [3]; }
163 def CortexA510WriteVST4 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
164                                                   let ReleaseAtCycles = [4]; }
166 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
168 // Branch
169 def : WriteRes<WriteBr, [CortexA510UnitB]>;
170 def : WriteRes<WriteBrReg, [CortexA510UnitB]>;
171 def : WriteRes<WriteSys, [CortexA510UnitB]>;
172 def : WriteRes<WriteBarrier, [CortexA510UnitB]>;
173 def : WriteRes<WriteHint, [CortexA510UnitB]>;
175 // FP ALU
176 //   As WriteF result is produced in F5 and it can be mostly forwarded
177 //   to consumer at F1, the effective Latency is set as 4.
178 def : WriteRes<WriteF, [CortexA510UnitVALU]> { let Latency = 4; }
179 def : WriteRes<WriteFCmp, [CortexA510UnitVALU]> { let Latency = 3; }
180 def : WriteRes<WriteFCvt, [CortexA510UnitVALU]> { let Latency = 4; }
181 def : WriteRes<WriteFCopy, [CortexA510UnitVALU]> { let Latency = 3; }
182 def : WriteRes<WriteFImm, [CortexA510UnitVALU]> { let Latency = 3; }
184 class CortexA510VSt<int n> : SchedWriteRes<[CortexA510UnitLdSt]> {
185   let RetireOOO = 1;
186   let ReleaseAtCycles = [n];
189 def CortexA510VSt0      : SchedWriteRes<[CortexA510UnitLdSt]> {
190   let RetireOOO = 1;
193 def : SchedAlias<WriteVd, CortexA510Write<4, CortexA510UnitVALU>>;
194 def : SchedAlias<WriteVq, CortexA510Write<4, CortexA510UnitVALU>>;
196 // FP ALU specific new schedwrite definitions
197 def CortexA510WriteFPALU_F3 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 3;}
198 def CortexA510WriteFPALU_F4 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 4;}
200 // FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
201 def : WriteRes<WriteFMul, [CortexA510UnitVMAC]> { let Latency = 4; }
203 let RetireOOO = 1 in {
204 def : WriteRes<WriteFDiv, [CortexA510UnitVMC]> { let Latency = 22;
205                                             let ReleaseAtCycles = [29]; }
206 def CortexA510WriteVMAC : SchedWriteRes<[CortexA510UnitVMAC]> { let Latency = 4; }
207 def CortexA510WriteFDivHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8;
208                                                      let ReleaseAtCycles = [5]; }
209 def CortexA510WriteFDivSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 13;
210                                                      let ReleaseAtCycles = [10]; }
211 def CortexA510WriteFDivDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22;
212                                                      let ReleaseAtCycles = [19]; }
213 def CortexA510WriteFSqrtHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8;
214                                                       let ReleaseAtCycles = [5]; }
215 def CortexA510WriteFSqrtSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 12;
216                                                       let ReleaseAtCycles = [9]; }
217 def CortexA510WriteFSqrtDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22;
218                                                       let ReleaseAtCycles = [19]; }
220 //===----------------------------------------------------------------------===//
221 // Subtarget-specific SchedRead types.
223 def : ReadAdvance<ReadVLD, 0>;
224 def : ReadAdvance<ReadExtrHi, 0>;
225 def : ReadAdvance<ReadAdrBase, 0>;
226 def : ReadAdvance<ReadST, 1>;
228 def : ReadAdvance<ReadI, 0>;
229 def : ReadAdvance<ReadISReg, 0>;
230 def : ReadAdvance<ReadIEReg, 0>;
233 // MUL
234 def : ReadAdvance<ReadIM, 0>;
235 def : ReadAdvance<ReadIMA, 2>;
237 // Div
238 def : ReadAdvance<ReadID, 0>;
240 //===----------------------------------------------------------------------===//
241 // Subtarget-specific InstRWs.
243 def A510WriteISReg : SchedWriteVariant<[
244        SchedVar<RegShiftedPred, [WriteISReg]>,
245        SchedVar<NoSchedPred, [WriteI]>]>;
246 def : InstRW<[A510WriteISReg], (instregex ".*rs$")>;
247 def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>;
249 // Pointer Authentication Instructions (v8.3 PAC)
250 // -----------------------------------------------------------------------------
252 // Authenticate data address
253 // Authenticate instruction address
254 // Compute pointer authentication code for data address
255 // Compute pointer authentication code, using generic key
256 // Compute pointer authentication code for instruction address
257 def : InstRW<[CortexA510Write<3, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>;
259 // Branch and link, register, with pointer authentication
260 // Branch, register, with pointer authentication
261 // Branch, return, with pointer authentication
262 def : InstRW<[A510Write_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
263                                             BRAAZ, BRAB, BRABZ, RETAA, RETAB,
264                                             ERETAA, ERETAB)>;
266 // Load register, with pointer authentication
267 def : InstRW<[CortexA510Write<2, CortexA510UnitPAC>], (instregex "^LDRA[AB](indexed|writeback)")>;
269 // Strip pointer authentication code
270 def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instrs XPACD, XPACI, XPACLRI)>;
271 //---
272 // Miscellaneous
273 //---
274 def : InstRW<[CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?Wi")>;
275 def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPSi")>;
276 def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)i")>;
277 def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQi")>;
278 def : InstRW<[WriteAdr, CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?W(pre|post)")>;
279 def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPS(pre|post)")>;
280 def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
281 def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQ(pre|post)")>;
282 def : InstRW<[WriteI], (instrs COPY)>;
283 //---
284 // Vector Loads - 128-bit per cycle
285 //---
286 //   1-element structures
287 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;                // single element
288 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
289 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
290 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
291 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures
292 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
293 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
294 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
295 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
296 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
298 def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
299 def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
300 def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
301 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; // NOTE(review): non-post form uses CortexA510WriteVLD1 - confirm VLD2 is intended
302 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; // NOTE(review): non-post form uses CortexA510WriteVLD1 - confirm VLD2 is intended
303 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; // NOTE(review): non-post form uses CortexA510WriteVLD1 - confirm VLD2 is intended
304 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
305 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
306 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
307 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
309 //    2-element structures
310 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
311 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
312 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
313 def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
315 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)_POST$")>; // post-index forms only; non-post forms have their own entries
316 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
317 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
318 def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
320 //    3-element structures
321 def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
322 def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
323 def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
324 def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
326 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
327 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
328 def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
329 def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
331 //    4-element structures
332 def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
333 def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs.
334 def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
335 def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
337 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
338 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
339 def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
340 def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
342 //---
343 // Vector Stores
344 //---
345 def : InstRW<[CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
346 def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
347 def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
348 def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
349 def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
350 def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
351 def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
352 def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
353 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
354 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
356 def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
357 def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
358 def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
359 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
360 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
361 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
363 def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
364 def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
365 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
366 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // same arrangement list as the non-post ST3Threev entry
368 def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
369 def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
370 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
371 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
373 //---
374 // Floating Point Conversions, MAC, DIV, SQRT
375 //---
376 def : InstRW<[CortexA510WriteFPALU_F3], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
377 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^XTN")>;
378 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
379 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
381 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
382 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
383 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
385 def : InstRW<[CortexA510WriteVMAC], (instregex "^FN?M(ADD|SUB).*")>;
386 def : InstRW<[CortexA510WriteVMAC], (instregex "^FML(A|S)v.*")>;
387 def : InstRW<[CortexA510WriteFDivHP], (instrs FDIVHrr)>;
388 def : InstRW<[CortexA510WriteFDivSP], (instrs FDIVSrr)>;
389 def : InstRW<[CortexA510WriteFDivDP], (instrs FDIVDrr)>;
390 def : InstRW<[CortexA510WriteFDivHP], (instregex "^FDIVv.*16$")>;
391 def : InstRW<[CortexA510WriteFDivSP], (instregex "^FDIVv.*32$")>;
392 def : InstRW<[CortexA510WriteFDivDP], (instregex "^FDIVv.*64$")>;
393 def : InstRW<[CortexA510WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
394 def : InstRW<[CortexA510WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
395 def : InstRW<[CortexA510WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
397 // 4.15. Advanced SIMD integer instructions
398 // ASIMD absolute diff
399 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
400 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
401 // ASIMD absolute diff accum
402 def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>;
403 // ASIMD absolute diff long
404 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>;
405 // ASIMD arith #1
406 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)",
407   "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>;
408 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)",
409   "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>;
410 // ASIMD arith #2
411 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
412   "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
413   "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$",
414   "ADDPv(2i32|4i16|8i8)$")>;
415 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
416   "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
417   "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$",
418   "ADDPv(16i8|2i64|4i32|8i16)$")>;
419 // ASIMD arith #3
420 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex  "SADDLv", "UADDLv", "SADDWv",
421   "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>;
422 // ASIMD arith #5
423 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>;
424 // ASIMD arith, reduce
425 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex  "ADDVv", "SADDLVv", "UADDLVv")>;
426 // ASIMD compare #1
427 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
428 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
429 // ASIMD compare #2
430 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
431 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
432 // ASIMD logical #1
433 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8",
434   "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
435 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8",
436   "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
437 // ASIMD max/min, basic
438 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; // 64-bit arrangements
439 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>; // 128-bit arrangements
440 // ASIMD max/min, reduce
441 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>;
442 // ASIMD multiply, by element
443 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
444   "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
445 // ASIMD multiply
446 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv8i8)>;
447 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv16i8)>;
448 // ASIMD multiply accumulate
449 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
450 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
451 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
452 // ASIMD multiply accumulate half
453 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQRDML[AS]H[vi]")>;
454 // ASIMD multiply accumulate long
455 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]ML[AS]Lv")>;
456 // ASIMD multiply accumulate long #2
457 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQDML[AS]L[iv]")>;
458 // ASIMD dot product
459 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv8i8")>;
460 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv16i8")>;
461 // ASIMD dot product, by scalar
462 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTlanev")>;
463 // ASIMD multiply long
464 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
465 // ASIMD polynomial (8x8) multiply long
466 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>;
467 // ASIMD pairwise add and accumulate
468 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>;
469 // ASIMD shift accumulate
470 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
471 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
472 // ASIMD shift accumulate #2
473 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>;
474 // ASIMD shift by immed
475 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv",
476   "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
477 // ASIMD shift by immed
478 // SXTL and UXTL are aliases for SHLL
479 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[US]?SHLLv")>;
480 // ASIMD shift by immed #2
481 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
482   "[SU]RSHRv(16i8|2i64|4i32|8i16)")>;
483 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RSHRNv(2i32|4i16|8i8)",
484   "RSHRNv(16i8|4i32|8i16)")>;
485 // ASIMD shift by register
486 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
487 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
488 // ASIMD shift by register #2
489 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
490 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;
492 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(1i64|2i32|4i16|8i8)")>;
493 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(2i64|4i32|8i16|16i8)")>;
495 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(1i64|2i32|4i16|8i8)")>;
496 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i64|4i32|8i16|16i8)")>;
498 // Cryptography extensions
499 // -----------------------------------------------------------------------------
501 // Crypto AES ops
502 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
504 // Crypto polynomial (64x64) multiply long
505 def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>;
507 // Crypto SHA1 hash acceleration op
508 // Crypto SHA1 schedule acceleration ops
509 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|SU1)")>;
511 // Crypto SHA1 hash acceleration ops
512 // Crypto SHA256 hash acceleration ops
513 def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
515 // Crypto SHA256 schedule acceleration ops
516 def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>;
518 // Crypto SHA512 hash acceleration ops
519 def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>;
521 // Crypto SHA3 ops
522 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3, XAR)>;
523 def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs RAX1)>;
526 // Crypto SM3 ops
527 def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
528                                                             "^SM3TT[12][AB]$")>;
530 // Crypto SM4 ops
531 def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>;
533 // CRC
534 // -----------------------------------------------------------------------------
536 def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")>;
538 // SVE Predicate instructions
540 // Loop control, based on predicate
541 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP,
542                                                   BRKB_PPmP, BRKB_PPzP)>;
544 // Loop control, based on predicate and flag setting
545 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
547 // Loop control, propagating
548 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>;
550 // Loop control, propagating and flag setting
551 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>;
552 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>;
555 // Loop control, based on GPR
556 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
557              (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
559 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
561 // Loop terminate
562 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
564 // Predicate counting scalar
565 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
567 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
568              (instregex "^CNT[BHWD]_XPiI")>;
570 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
571              (instregex "^(INC|DEC)[BHWD]_XPiI")>;
573 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
574              (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>;
576 // Predicate counting scalar, active predicate
577 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
578              (instregex "^CNTP_XPP_[BHSD]")>;
580 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
581              (instregex "^(DEC|INC)P_XP_[BHSD]")>;
583 def : InstRW<[CortexA510Write<8, CortexA510UnitVALU0>],
584              (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
585                         "^(UQDEC|UQINC)P_WP_[BHSD]",
586                         "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>;
589 // Predicate counting vector, active predicate
590 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
591              (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
593 // Predicate logical
594 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
595              (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
597 // Predicate logical, flag setting
598 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
599              (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
601 // Predicate reverse
602 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>;
604 // Predicate select
605 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs SEL_PPPP)>;
607 // Predicate set
608 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
610 // Predicate set/initialize, set flags
611 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>;
613 // Predicate find first/next
614 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
616 // Predicate test
617 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PTEST_PP)>;
619 // Predicate transpose
620 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>;
622 // Predicate unpack and widen
623 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
625 // Predicate zip/unzip
626 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>;
629 // SVE integer instructions
630 // -----------------------------------------------------------------------------
631 // Arithmetic, absolute diff
632 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>;
634 // Arithmetic, absolute diff accum
635 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
637 // Arithmetic, absolute diff accum long
638 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
640 // Arithmetic, absolute diff long
641 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
643 // Arithmetic, basic
644 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
645              (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]",
646                         "^(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
647                         "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
648                         "^(ADD|SUB)_ZZZ_[BHSD]",
649                         "^(ADD|SUB|SUBR)_ZI_[BHSD]",
650                         "^ADR_[SU]XTW_ZZZ_D_[0123]",
651                         "^ADR_LSL_ZZZ_[SD]_[0123]",
652                         "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
653                         "^SADDLBT_ZZZ_[HSD]",
654                         "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
655                         "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
657 // Arithmetic, complex
658 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
659              (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]",
660                         "^SQ(ABS|NEG)_ZPmZ_[BHSD]",
661                         "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]",
662                         "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
663                         "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
664                         "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
665                         "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
667 // Arithmetic, large integer
668 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
670 // Arithmetic, pairwise add
671 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^ADDP_ZPmZ_[BHSD]")>;
673 // Arithmetic, pairwise add and accum long
674 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
676 // Arithmetic, shift
677 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
678              (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
679                         "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
680                         "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
681                         "^(ASR|LSL|LSR)_ZPZI_[BHSD]",
682                         "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
683                         "^(ASR|LSL|LSR)_ZPZZ_[BHSD]",
684                         "^(ASR|LSL|LSR)_ZZI_[BHSD]",
685                         "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
686 // Arithmetic, shift right for divide
687 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
688              (instregex "^ASRD_ZPmI_[BHSD]",
689                         "^ASRD_ZPZI_[BHSD]")>;
691 // Arithmetic, shift and accumulate
692 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
693              (instregex "^(SSRA|USRA)_ZZI_[BHSD]")>;
695 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>],
696              (instregex "^(SRSRA|URSRA)_ZZI_[BHSD]")>;
699 // Arithmetic, shift by immediate
700 // Arithmetic, shift by immediate and insert
701 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
702              (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]")>;
704 // Arithmetic, shift complex
705 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
706              (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
707                         "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_(ZPmZ|ZPZZ)_[BHSD]",
708                         "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
709                         "^SQSHRU?N[BT]_ZZI_[BHS]",
710                         "^UQR?SHRN[BT]_ZZI_[BHS]")>;
712 // Arithmetic, shift rounding
713 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
714              (instregex "^(SRSHL|SRSHR|SRSHLR|URSHL|URSHLR|URSHR)_(ZPmZ|ZPZZ|ZPZI)_[BHSD]",
715                         "^[SU]RSHR_ZPmI_[BHSD]")>;
717 // Bit manipulation
718 def : InstRW<[CortexA510MCWrite<14, 13, CortexA510UnitVMC>],
719              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_B")>;
721 def : InstRW<[CortexA510MCWrite<22, 21, CortexA510UnitVMC>],
722              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_H")>;
724 def : InstRW<[CortexA510MCWrite<38, 37, CortexA510UnitVMC>],
725              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_S")>;
727 def : InstRW<[CortexA510MCWrite<70, 69, CortexA510UnitVMC>],
728              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_D")>;
731 // Bitwise select
732 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
734 // Count/reverse bits
735 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>;
736 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>;
737 def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>;
738 def : InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>;
739 // Broadcast logical bitmask immediate to vector
740 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>;
742 // Compare and set flags
743 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
744              (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
745                         "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
747 // Complex add
748 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CADD_ZZI_[BHSD]")>;
750 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^SQCADD_ZZI_[BHSD]")>;
752 // Complex dot product 8-bit element
753 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
755 // Complex dot product 16-bit element
756 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
758 // Complex multiply-add B, H, S element size
759 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^CMLA_ZZZ_[BHS]",
760                                             "^CMLA_ZZZI_[HS]")>;
762 // Complex multiply-add D element size
763 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CMLA_ZZZ_D)>;
765 // Conditional extract operations, scalar form
766 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
768 // Conditional extract operations, SIMD&FP scalar and vector forms; also COMPACT and SPLICE
769 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
770                                             "^COMPACT_ZPZ_[SD]",
771                                             "^SPLICE_ZPZZ?_[BHSD]")>;
773 // Convert to floating point, 64b to float or convert to double
774 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]")>;
776 // Convert to floating point, 64b to half
777 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_DtoH")>;
779 // Convert to floating point, 32b to single or half
780 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
782 // Convert to floating point, 32b to double
783 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_StoD")>;
785 // Convert to floating point, 16b to half
786 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
788 // Copy, scalar
789 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>],(instregex "^CPY_ZPmR_[BHSD]")>;
791 // Copy, scalar SIMD&FP or imm
792 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CPY_ZPm[IV]_[BHSD]",
793                                            "^CPY_ZPzI_[BHSD]")>;
795 // Divides, 32 bit
796 def : InstRW<[CortexA510MCWrite<15, 12, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_S")>;
798 // Divides, 64 bit
799 def : InstRW<[CortexA510MCWrite<26, 23, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>;
801 // Dot product, 8 bit
802 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>;
804 // Dot product, 8 bit, using signed and unsigned integers
805 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
807 // Dot product, 16 bit
808 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>;
810 // Duplicate, immediate and indexed form
811 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZI_[BHSD]",
812                                            "^DUP_ZZI_[BHSDQ]")>;
814 // Duplicate, scalar form
815 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZR_[BHSD]")>;
817 // Extend, sign or zero
818 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ_[HSD]",
819                                             "^[SU]XTH_ZPmZ_[SD]",
820                                             "^[SU]XTW_ZPmZ_[D]")>;
822 // Extract
823 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>;
825 // Extract narrow saturating
826 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
827                                             "^SQXTUN[BT]_ZZ_[BHS]")>;
829 // Extract/insert operation, SIMD and FP scalar form
830 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^LAST[AB]_VPZ_[BHSD]",
831                                             "^INSR_ZV_[BHSD]")>;
833 // Extract/insert operation, scalar
834 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^LAST[AB]_RPZ_[BHSD]",
835                                                 "^INSR_ZR_[BHSD]")>;
837 // Histogram operations
838 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^HISTCNT_ZPzZZ_[SD]",
839                                                   "^HISTSEG_ZZZ")>;
841 // Horizontal operations, B, H, S form, immediate operands only
842 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_II_[BHS]")>;
844 // Horizontal operations, B, H, S form, scalar, immediate operands/ scalar
845 // operands only / immediate, scalar operands
846 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
848 // Horizontal operations, D form, immediate operands only
849 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs INDEX_II_D)>;
851 // Horizontal operations, D form, scalar, immediate operands/ scalar operands
852 // only / immediate, scalar operands
853 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_D")>;
855 // Logical: vector-immediate (_ZI), unpredicated (_ZZZ) and predicated (_ZPmZ, _ZPZZ) forms
856 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
857              (instregex "^(AND|EOR|ORR)_ZI",
858                         "^(AND|BIC|EOR|ORR)_ZZZ",
859                         "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]",
860                         "^(AND|BIC|EOR|NOT|ORR)_ZPZZ_[BHSD]")>;
862 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
863              (instregex "^EOR(BT|TB)_ZZZ_[BHSD]")>;
865 // Max/min, basic and pairwise
866 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
867                                            "^[SU](MAX|MIN)P?_(ZPmZ|ZPZZ)_[BHSD]")>;
869 // Matching operations
870 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^N?MATCH_PPzZZ_[BH]")>;
872 // Matrix multiply-accumulate
873 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
875 // Move prefix
876 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
877                                            "^MOVPRFX_ZZ")>;
879 // Multiply, B, H, S element size
880 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_[BHS]",
881                                             "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_[BHS]")>;
883 // Multiply, D element size
884 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_D",
885                                             "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_D")>;
887 // Multiply long
888 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
889                                             "^[SU]MULL[BT]_ZZZ_[HSD]")>;
891 // Multiply accumulate, B, H, S element size
892 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_[BHS]",
893                                             "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
895 // Multiply accumulate, D element size
896 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_D",
897                                             "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
899 // Multiply accumulate long
900 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
901                                             "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
903 // Multiply accumulate saturating doubling long regular
904 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]",
905                                             "^SQDML[AS](LB|LT)_ZZZI_[SD]")>;
907 // Multiply saturating doubling high, B, H, S element size
908 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULH_ZZZ_[BHS]",
909                                             "^SQDMULH_ZZZI_[HS]")>;
911 // Multiply saturating doubling high, D element size
912 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
914 // Multiply saturating doubling long
915 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
916                                             "^SQDMULL[BT]_ZZZI_[SD]")>;
918 // Multiply saturating rounding doubling regular/complex accumulate, B, H, S
919 // element size
920 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
921                                             "^SQRDCMLAH_ZZZ_[BHS]",
922                                             "^SQRDML[AS]H_ZZZI_[HS]",
923                                             "^SQRDCMLAH_ZZZI_[HS]")>;
925 // Multiply saturating rounding doubling regular/complex accumulate, D element
926 // size
927 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZI?_D",
928                                             "^SQRDCMLAH_ZZZ_D")>;
930 // Multiply saturating rounding doubling regular/complex, B, H, S element size
931 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZ_[BHS]",
932                                             "^SQRDMULH_ZZZI_[HS]")>;
934 // Multiply saturating rounding doubling regular/complex, D element size
935 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZI?_D")>;
937 // Multiply/multiply long, (8x8) polynomial
938 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>;
940 def : InstRW<[CortexA510Write<6, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>;
943 // Predicate counting vector
944 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
945              (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>;
947 // Reciprocal estimate
948 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
950 // Reduction, arithmetic, B form
951 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
953 // Reduction, arithmetic, H form
954 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
956 // Reduction, arithmetic, S form
957 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
959 // Reduction, arithmetic, D form
960 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
962 // Reduction, logical
963 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>;
965 // Reverse, vector
966 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]",
967                                            "^REVB_ZPmZ_[HSD]",
968                                            "^REVH_ZPmZ_[SD]",
969                                            "^REVW_ZPmZ_D")>;
971 // Select, vector form
972 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SEL_ZPZZ_[BHSD]")>;
974 // Table lookup
975 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[BHSD]")>;
977 // Table lookup extension
978 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>;
980 // Transpose, vector form
981 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
983 // Unpack and extend
984 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
986 // Zip/unzip
987 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
989 // SVE floating-point instructions
990 // -----------------------------------------------------------------------------
992 // Floating point absolute value/difference
993 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FAB[SD]_ZPmZ_[HSD]",
994                                                                   "^FAB[SD]_ZPZZ_[HSD]")>;
996 // Floating point arithmetic
997 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ|ZPZI|ZPZZ)_[HSD]",
998                                            "^FADDP_ZPmZZ_[HSD]",
999                                            "^FNEG_ZPmZ_[HSD]",
1000                                            "^FSUBR_(ZPm[IZ]|ZPZ[IZ])_[HSD]")>;
1002 // Floating point associative add, F16
1003 def : InstRW<[CortexA510MCWrite<32, 29, CortexA510UnitVALU>], (instrs FADDA_VPZ_H)>;
1005 // Floating point associative add, F32
1006 def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_S)>;
1008 // Floating point associative add, F64
1009 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;
1011 // Floating point compare
1012 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
1013                                             "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
1014                                             "^FCM(LE|LT)_PPzZ0_[HSD]",
1015                                             "^FCMUO_PPzZZ_[HSD]")>;
1017 // Floating point complex add
1018 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCADD_ZPmZ_[HSD]")>;
1020 // Floating point complex multiply add
1021 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FCMLA_ZPmZZ_[HSD]",
1022                                            "^FCMLA_ZZZI_[HS]")>;
1024 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
1025 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
1026                                             "^FCVTLT_ZPmZ_HtoS",
1027                                             "^FCVTNT_ZPmZ_StoH")>;
1029 // Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
1030 // or F64 to F16)
1031 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
1032                                             "^FCVTLT_ZPmZ_StoD",
1033                                             "^FCVTNT_ZPmZ_DtoS")>;
1035 // Floating point convert, round to odd (FCVTX and its narrowing-top form FCVTXNT, F64 to F32)
1036 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTX_ZPmZ_DtoS", "^FCVTXNT_ZPmZ_DtoS")>;
1038 // Floating point base2 log, F16
1039 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
1041 // Floating point base2 log, F32
1042 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
1044 // Floating point base2 log, F64
1045 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
1047 // Floating point convert to integer, F16
1048 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
1050 // Floating point convert to integer, F32
1051 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
1053 // Floating point convert to integer, F64
1054 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
1055              (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
1057 // Floating point copy
1058 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>], (instregex "^FCPY_ZPmI_[HSD]",
1059                                            "^FDUP_ZI_[HSD]")>;
1061 // Floating point divide, F16
1062 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
1064 // Floating point divide, F32
1065 def : InstRW<[CortexA510MCWrite<13, 10, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
1067 // Floating point divide, F64
1068 def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
1070 // Floating point min/max pairwise
1071 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
1073 // Floating point min/max
1074 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?_(ZPm[IZ]|ZPZZ|ZPZI)_[HSD]")>;
1076 // Floating point multiply
1077 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^(FSCALE|FMULX)_(ZPmZ|ZPZZ)_[HSD]",
1078                                            "^FMUL_(ZPm[IZ]|ZZZI?|ZPZI|ZPZZ)_[HSD]")>;
1080 // Floating point multiply accumulate
1081 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
1082              (instregex "^FML[AS]_(ZPmZZ|ZZZI|ZPZZZ)_[HSD]",
1083                         "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_(ZPmZZ|ZPZZZ)_[HSD]")>;
1085 // Floating point multiply add/sub accumulate long
1086 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
1088 // Floating point reciprocal estimate, F16
1089 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_H", "^FRECPX_ZPmZ_H",
1090                                          "^FRSQRTE_ZZ_H")>;
1092 // Floating point reciprocal estimate, F32
1093 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_S", "^FRECPX_ZPmZ_S",
1094                                          "^FRSQRTE_ZZ_S")>;
1095 // Floating point reciprocal estimate, F64
1096 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],(instregex "^FRECPE_ZZ_D", "^FRECPX_ZPmZ_D",
1097                                          "^FRSQRTE_ZZ_D")>;
1099 // Floating point reciprocal step
1100 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
1102 // Floating point reduction, min/max (FMAXNMV, FMAXV, FMINNMV, FMINV), F16/F32/F64
1103 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
1104              (instregex "^(FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_[HSD]")>;
1106 // Floating point reduction, add (FADDV), F16
1107 def : InstRW<[CortexA510MCWrite<12, 11, CortexA510UnitVALU0>],
1108              (instregex "^FADDV_VPZ_H")>;
1110 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU0>],  // Floating point reduction, add (FADDV), F32
1111              (instregex "^FADDV_VPZ_S")>;
1113 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],  // Floating point reduction, add (FADDV), F64
1114              (instregex "^FADDV_VPZ_D")>;
1117 // Floating point round to integral, F16
1118 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
1120 // Floating point round to integral, F32
1121 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
1123 // Floating point round to integral, F64
1124 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
1126 // Floating point square root, F16
1127 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_H")>;
1129 // Floating point square root, F32
1130 def : InstRW<[CortexA510MCWrite<12, 9, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_S")>;
1132 // Floating point square root, F64
1133 def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_D")>;
1135 // Floating point trigonometric exponentiation
1136 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FEXPA_ZZ_[HSD]")>;
1138 // Floating point trigonometric multiply add
1139 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[HSD]")>;
1141 // Floating point trigonometric, miscellaneous
1142 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>;
1143 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>;
1146 // SVE BFloat16 (BF16) instructions
1147 // -----------------------------------------------------------------------------
1149 // Convert, F32 to BF16
1150 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
1152 // Dot product
1153 def : InstRW<[A510Write_10cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
1155 // Matrix multiply accumulate
1156 def : InstRW<[A510Write_15cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>;
1158 // Multiply accumulate long
1159 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>;
1161 // SVE Load instructions
1162 // -----------------------------------------------------------------------------
1164 // Load vector
1165 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instrs LDR_ZXI)>;
1167 // Load predicate
1168 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instrs LDR_PXI)>;
1170 // Contiguous load, scalar + imm
1171 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]_IMM$",
1172                                            "^LD1S?B_[HSD]_IMM$",
1173                                            "^LD1S?H_[SD]_IMM$",
1174                                            "^LD1S?W_D_IMM$" )>;
1175 // Contiguous load, scalar + scalar
1176 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]$",
1177                                              "^LD1S?B_[HSD]$",
1178                                              "^LD1S?H_[SD]$",
1179                                              "^LD1S?W_D$" )>;
1181 // Contiguous load broadcast, scalar + imm
1182 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1R[BHWD]_IMM$",
1183                                            "^LD1RSW_IMM$",
1184                                            "^LD1RS?B_[HSD]_IMM$",
1185                                            "^LD1RS?H_[SD]_IMM$",
1186                                            "^LD1RS?W_D_IMM$",
1187                                            "^LD1RQ_[BHWD]_IMM$")>;
1189 // Contiguous load broadcast, scalar + scalar
1190 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LD1RQ_[BHWD]$")>;
1192 // Non temporal load, scalar + imm
1193 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRI$")>;
1195 // Non temporal load, scalar + scalar
1196 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRR$")>;
1198 // Non temporal gather load, vector + scalar 32-bit element size
1199 def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>], (instregex "^LDNT1[BHW]_ZZR_S_REAL$",
1200                                               "^LDNT1S[BH]_ZZR_S_REAL$")>;
1202 // Non temporal gather load, vector + scalar 64-bit element size
1203 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^LDNT1S?[BHW]_ZZR_D_REAL$")>;
1204 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instrs LDNT1D_ZZR_D_REAL)>;
1206 // Contiguous first faulting load, scalar + scalar
1207 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LDFF1[BHWD]_REAL$",
1208                                               "^LDFF1S?B_[HSD]_REAL$",
1209                                               "^LDFF1S?H_[SD]_REAL$",
1210                                               "^LDFF1S?W_D_REAL$")>;
1212 // Contiguous non faulting load, scalar + imm
1213 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LDNF1[BHWD]_IMM_REAL$",
1214                                            "^LDNF1S?B_[HSD]_IMM_REAL$",
1215                                            "^LDNF1S?H_[SD]_IMM_REAL$",
1216                                            "^LDNF1S?W_D_IMM_REAL$")>;
1218 // Contiguous load two structures to two vectors, scalar + imm (LD2[BHWD]_IMM)
1219 def : InstRW<[CortexA510MCWrite<3, 1, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]_IMM$")>;
1221 // Contiguous load two structures to two vectors, scalar + scalar (LD2[BHWD]). NOTE(review): second CortexA510MCWrite argument is 2 here but 1 for the scalar + imm form, whereas LD3/LD4 use one value for both addressing modes - confirm intended.
1222 def : InstRW<[CortexA510MCWrite<3, 2, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]$")>;
1224 // Contiguous load three structures to three vectors, scalar + imm (LD3[BHWD]_IMM)
1225 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]_IMM$")>;
1227 // Contiguous load three structures to three vectors, scalar + scalar (LD3[BHWD])
1228 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]$")>;
1230 // Contiguous load four structures to four vectors, scalar + imm (LD4[BHWD]_IMM)
1231 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]_IMM$")>;
1233 // Contiguous load four structures to four vectors, scalar + scalar (LD4[BHWD])
1234 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]$")>;
1236 // Gather load, vector + imm, 32-bit element size (GLD1* and GLDFF1*, _REAL opcodes)
1237 def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>], (instregex "^GLD(FF)?1S?[BH]_S_IMM_REAL$",
1238                                               "^GLD(FF)?1W_IMM_REAL$")>;
1240 // Gather load, vector + imm, 64-bit element size (GLD1* and GLDFF1*, _REAL opcodes)
1241 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^GLD(FF)?1S?[BHW]_D_IMM_REAL$",
1242                                               "^GLD(FF)?1D_IMM_REAL$")>;
1244 // Gather load, 64-bit element size (_D forms and GLD1D, with or without [SU]XTW, scaled or unscaled)
1245 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>],
1246              (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW_(SCALED_)?REAL$",
1247                         "^GLD(FF)?1S?[BHW]_D_(SCALED_)?REAL$",
1248                         "^GLD(FF)?1D_[SU]XTW_(SCALED_)?REAL$",
1249                         "^GLD(FF)?1D_(SCALED_)?REAL$")>;
1251 // Gather load, 32-bit scaled offset (GLD1* and GLDFF1* _S/W forms with [SU]XTW_SCALED; exact _REAL opcode names only)
1252 def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>],
1253              (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED_REAL$",
1254                         "^GLD(FF)?1W_[SU]XTW_SCALED_REAL$")>;
1256 // Gather load, 32-bit unscaled offset (_S and W forms with [SU]XTW; the unpacked _D_[SU]XTW forms are matched by the 64-bit element size entry)
1257 def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW_REAL$",
1258                                               "^GLD(FF)?1W_[SU]XTW_REAL$")>;
1260 def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>], (instregex "^PRF(B|H|W|D).*")>; // Prefetch: every opcode whose name starts with PRFB, PRFH, PRFW or PRFD
1261 // SVE Store instructions
1262 // -----------------------------------------------------------------------------
1264 // Store from predicate reg (STR_PXI)
1265 def : InstRW<[CortexA510VSt0], (instrs STR_PXI)>;
1267 // Store from vector reg (STR_ZXI)
1268 def : InstRW<[CortexA510VSt0], (instrs STR_ZXI)>;
1270 // Contiguous store, scalar + imm (ST1[BHWD]_IMM plus the ST1B_[HSD], ST1H_[SD] and ST1W_D _IMM forms)
1271 def : InstRW<[CortexA510VSt0], (instregex "^ST1[BHWD]_IMM$",
1272                                                 "^ST1B_[HSD]_IMM$",
1273                                                 "^ST1H_[SD]_IMM$",
1274                                                 "^ST1W_D_IMM$")>;
1276 // Contiguous store, scalar + scalar (ST1H, ST1H_S and ST1H_D have their own record; both records use CortexA510VSt0)
1277 def : InstRW<[CortexA510VSt0], (instregex "^ST1H(_[SD])?$")>;
1278 def : InstRW<[CortexA510VSt0], (instregex "^ST1[BWD]$",
1279                                                 "^ST1B_[HSD]$",
1280                                                 "^ST1W_D$")>;
1282 // Contiguous store two structures from two vectors, scalar + imm (ST2[BHWD]_IMM)
1283 def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BHWD]_IMM$")>;
1285 // Contiguous store two structures from two vectors, scalar + scalar (ST2H, listed by name)
1286 def : InstRW<[CortexA510VSt<11>], (instrs ST2H)>;
1288 // Contiguous store two structures from two vectors, scalar + scalar (remaining ST2B/ST2W/ST2D forms)
1289 def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BWD]$")>;
1291 // Contiguous store three structures from three vectors, scalar + imm (B/H/W forms and the D form use different CortexA510VSt arguments)
1292 def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]_IMM$")>;
1293 def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D_IMM$")>;
1295 // Contiguous store three structures from three vectors, scalar + scalar (same B/H/W vs. D split as the scalar + imm forms)
1296 def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]$")>;
1297 def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D$")>;
1299 // Contiguous store four structures from four vectors, scalar + imm (B/H/W forms and the D form use different CortexA510VSt arguments)
1300 def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]_IMM$")>;
1301 def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D_IMM$")>;
1303 // Contiguous store four structures from four vectors, scalar + scalar (ST4B/ST4H/ST4W)
1304 def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]$")>;
1306 // Contiguous store four structures from four vectors, scalar + scalar (ST4D)
1307 def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D$")>;
1309 // Non-temporal store, scalar + imm (STNT1[BHWD]_ZRI)
1310 def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BHWD]_ZRI$")>;
1312 // Non-temporal store, scalar + scalar (STNT1H_ZRR is listed by name, the B/W/D forms by regex; both use CortexA510VSt0)
1313 def : InstRW<[CortexA510VSt0], (instrs STNT1H_ZRR)>;
1314 def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BWD]_ZRR$")>;
1316 // Scatter non-temporal store, vector + scalar, 32-bit element size. NOTE(review): unlike most patterns in this file this one is not `$`-anchored - presumably so suffixed opcode names also match; confirm.
1317 def : InstRW<[CortexA510VSt<9>], (instregex "^STNT1[BHW]_ZZR_S")>;
1319 // Scatter non-temporal store, vector + scalar, 64-bit element size. NOTE(review): pattern is likewise not `$`-anchored; confirm intended.
1320 def : InstRW<[CortexA510VSt<7>], (instregex "^STNT1[BHWD]_ZZR_D")>;
1322 // Scatter store, vector + imm, 32-bit element size (SST1[BH]_S_IMM and SST1W_IMM)
1323 def : InstRW<[CortexA510VSt<9>], (instregex "^SST1[BH]_S_IMM$",
1324                                                 "^SST1W_IMM$")>;
1326 // Scatter store, vector + imm, 64-bit element size (SST1[BHW]_D_IMM and SST1D_IMM)
1327 def : InstRW<[CortexA510VSt<7>], (instregex "^SST1[BHW]_D_IMM$",
1328                                                 "^SST1D_IMM$")>;
1330 // Scatter store, 32-bit scaled offset (SST1H_S and SST1W with [SU]XTW_SCALED); every register-offset scatter store from here to the end of the store section uses CortexA510VSt<8>
1331 def : InstRW<[CortexA510VSt<8>],
1332              (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
1334 // Scatter store, 32-bit unpacked unscaled offset (_D forms and SST1D with [SU]XTW)
1335 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BHW]_D_[SU]XTW$",
1336                                                 "^SST1D_[SU]XTW$")>;
1338 // Scatter store, 32-bit unpacked scaled offset (_D forms and SST1D with [SU]XTW_SCALED; no byte form)
1339 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
1340                                                 "^SST1D_[SU]XTW_SCALED$")>;
1342 // Scatter store, 32-bit unscaled offset (SST1[BH]_S and SST1W with [SU]XTW)
1343 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BH]_S_[SU]XTW$",
1344                                                 "^SST1W_[SU]XTW$")>;
1346 // Scatter store, 64-bit scaled offset (_D forms and SST1D with _SCALED; no byte form)
1347 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[HW]_D_SCALED$",
1348                                                 "^SST1D_SCALED$")>;
1350 // Scatter store, 64-bit unscaled offset (SST1[BHW]_D and SST1D with no suffix)
1351 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BHW]_D$",
1352                                                 "^SST1D$")>;
1354 // SVE Miscellaneous instructions
1355 // -----------------------------------------------------------------------------
1357 // Read first fault register, unpredicated (RDFFR_P_REAL)
1358 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs RDFFR_P_REAL)>;
1360 // Read first fault register, predicated (RDFFR_PPz_REAL)
1361 def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFR_PPz_REAL)>;
1363 // Read first fault register and set flags (RDFFRS_PPz)
1364 def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFRS_PPz)>;
1366 // Set first fault register (SETFFR)
1367 // Write to first fault register (WRFFR)
1368 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs SETFFR, WRFFR)>;
1370 // SVE Cryptographic instructions
1371 // -----------------------------------------------------------------------------
1373 // Crypto AES ops (AESD/AESE _ZZZ_B and AESMC/AESIMC _ZZ_B)
1374 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]_ZZZ_B$",
1375                                            "^AESI?MC_ZZ_B$")>;
1377 // Crypto SHA3 ops (BCAX, EOR3 and XAR share the first record; RAX1_ZZZ_D has its own record using CortexA510MC_RC0Write)
1378 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$",
1379                                             "^XAR_ZZZI_[BHSD]$")>;
1381 def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>;
1383 // Crypto SM4 ops (SM4E_ZZZ_S and SM4EKEY_ZZZ_S)
1384 def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>;