[llvm-shlib] Fix the version naming style of libLLVM for Windows (#85710)
[llvm-project.git] / llvm / lib / Target / AArch64 / AArch64SchedNeoverseV2.td
blobe7de40fdf1deb08b558a9499b893b70f01a4e1ec
1 //=- AArch64SchedNeoverseV2.td - NeoverseV2 Scheduling Defs --*- tablegen -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the scheduling model for the Arm Neoverse V2 processors.
10 // All information is taken from the V2 Software Optimisation guide:
12 // https://developer.arm.com/documentation/PJDOC-466751330-593177/r0p2
14 //===----------------------------------------------------------------------===//
// Top-level machine model record for Neoverse V2. Fields whose trailing
// comment carries "NOTE: Copied from ..." reuse the value of another core's
// model (N2 or Cortex-A57), as stated in those notes.
16 def NeoverseV2Model : SchedMachineModel {
17   let IssueWidth            =  16; // Micro-ops dispatched at a time.
18   let MicroOpBufferSize     = 160; // Entries in micro-op re-order buffer. NOTE: Copied from N2.
19   let LoadLatency           =   4; // Optimistic load latency.
20   let MispredictPenalty     =  10; // Extra cycles for mispredicted branch.  NOTE: Copied from N2.
21   let LoopMicroOpBufferSize =  16; // NOTE: Copied from Cortex-A57.
22   let CompleteModel         =   1; // NOTE(review): presumably asserts every instruction is modelled - confirm in TargetSchedule.td.
// Predicates excluded from this model: everything in SMEUnsupported.F plus
// SVE2p1, CPA and CSSC.
24   list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
25                                                     [HasSVE2p1, HasCPA,
26                                                     HasCSSC]);
29 //===----------------------------------------------------------------------===//
30 // Define each kind of processor resource and number available on Neoverse V2.
31 // Instructions are first fetched and then decoded into internal macro-ops
32 // (MOPs). From there, the MOPs proceed through register renaming and dispatch
33 // stages. A MOP can be split into two micro-ops further down the pipeline
34 // after the decode stage. Once dispatched, micro-ops wait for their operands
35 // and issue out-of-order to one of seventeen issue pipelines. Each issue
36 // pipeline can accept one micro-op per cycle.
38 let SchedModel = NeoverseV2Model in {
40 // Define the (17) issue ports.
// The ProcResource<N> counts below add up to 17:
// B(2) + S0..S3(4) + M0/M1(2) + V0..V3(4) + L01(2) + L2(1) + D(2).
41 def V2UnitB   : ProcResource<2>;  // Branch 0/1
42 def V2UnitS0  : ProcResource<1>;  // Integer single-cycle 0
43 def V2UnitS1  : ProcResource<1>;  // Integer single-cycle 1
44 def V2UnitS2  : ProcResource<1>;  // Integer single-cycle 2
45 def V2UnitS3  : ProcResource<1>;  // Integer single-cycle 3
46 def V2UnitM0  : ProcResource<1>;  // Integer single/multicycle 0
47 def V2UnitM1  : ProcResource<1>;  // Integer single/multicycle 1
48 def V2UnitV0  : ProcResource<1>;  // FP/ASIMD 0
49 def V2UnitV1  : ProcResource<1>;  // FP/ASIMD 1
50 def V2UnitV2  : ProcResource<1>;  // FP/ASIMD 2
51 def V2UnitV3  : ProcResource<1>;  // FP/ASIMD 3
52 def V2UnitL01 : ProcResource<2>;  // Load/Store 0/1
53 def V2UnitL2  : ProcResource<1>;  // Load 2
54 def V2UnitD   : ProcResource<2>;  // Store data 0/1
// Resource groups built from the issue ports defined above. The trailing
// comment on each line names the member pipelines of that group.
56 def V2UnitR   : ProcResGroup<[V2UnitS0, V2UnitS1]>;  // Integer single-cycle 0/1
57 def V2UnitS   : ProcResGroup<[V2UnitS0, V2UnitS1, V2UnitS2, V2UnitS3]>;  // Integer single-cycle 0/1/2/3
58 def V2UnitF   : ProcResGroup<[V2UnitS0, V2UnitS1, V2UnitM0, V2UnitM1]>;  // Integer single-cycle 0/1 and single/multicycle 0/1
59 def V2UnitI   : ProcResGroup<[V2UnitS0, V2UnitS1, V2UnitS2, V2UnitS3, V2UnitM0, V2UnitM1]>;  // Integer single-cycle 0/1/2/3 and single/multicycle 0/1
60 def V2UnitM   : ProcResGroup<[V2UnitM0, V2UnitM1]>;  // Integer single/multicycle 0/1
61 def V2UnitL   : ProcResGroup<[V2UnitL01, V2UnitL2]>; // Load/Store 0/1 and Load 2
62 def V2UnitV   : ProcResGroup<[V2UnitV0, V2UnitV1, V2UnitV2, V2UnitV3]>;  // FP/ASIMD 0/1/2/3
63 def V2UnitV01 : ProcResGroup<[V2UnitV0, V2UnitV1]>;  // FP/ASIMD 0/1
64 def V2UnitV02 : ProcResGroup<[V2UnitV0, V2UnitV2]>;  // FP/ASIMD 0/2
65 def V2UnitV13 : ProcResGroup<[V2UnitV1, V2UnitV3]>;  // FP/ASIMD 1/3
66 def V2UnitV23 : ProcResGroup<[V2UnitV2, V2UnitV3]>;  // FP/ASIMD 2/3
68 // Define commonly used read types.
70 // No forwarding is provided for these types.
// Every ReadAdvance below is given 0 cycles.
71 def : ReadAdvance<ReadI,       0>;
72 def : ReadAdvance<ReadISReg,   0>;
73 def : ReadAdvance<ReadIEReg,   0>;
74 def : ReadAdvance<ReadIM,      0>;
75 def : ReadAdvance<ReadIMA,     0>;
76 def : ReadAdvance<ReadID,      0>;
77 def : ReadAdvance<ReadExtrHi,  0>;
78 def : ReadAdvance<ReadAdrBase, 0>;
79 def : ReadAdvance<ReadST,      0>;
80 def : ReadAdvance<ReadVLD,     0>;
82 // NOTE: Copied from N2.
// All four records use an empty resource list: WriteAtomic is marked
// Unsupported, the other three only set a latency.
83 def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
84 def : WriteRes<WriteBarrier, []> { let Latency = 1; }
85 def : WriteRes<WriteHint,    []> { let Latency = 1; }
86 def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
88 //===----------------------------------------------------------------------===//
89 // Define customized scheduler read/write types specific to the Neoverse V2.
91 //===----------------------------------------------------------------------===//
93 // Define generic 0 micro-op types
// V2Write_0cyc uses no processor resources and has a latency of 0.
94 def V2Write_0cyc : SchedWriteRes<[]> { let Latency = 0; }
96 // Define generic 1 micro-op types
// Naming: V2Write_<latency>cyc_1<unit>. Each record names exactly one resource
// (V2Unit<unit>) and sets Latency to the number in its name.
// Some records also set ReleaseAtCycles; presumably this is the number of
// cycles the unit stays occupied - confirm in TargetSchedule.td.
// NOTE(review): V2Write_12cyc_1V0 uses ReleaseAtCycles = [11] while
// V2Write_12cyc_1M0 uses [12] - confirm against the optimisation guide.
98 def V2Write_1cyc_1B    : SchedWriteRes<[V2UnitB]>   { let Latency = 1; }
99 def V2Write_1cyc_1F    : SchedWriteRes<[V2UnitF]>   { let Latency = 1; }
100 def V2Write_1cyc_1I    : SchedWriteRes<[V2UnitI]>   { let Latency = 1; }
101 def V2Write_1cyc_1M    : SchedWriteRes<[V2UnitM]>   { let Latency = 1; }
102 def V2Write_1cyc_1M0   : SchedWriteRes<[V2UnitM0]>  { let Latency = 1; }
103 def V2Write_1cyc_1L01  : SchedWriteRes<[V2UnitL01]> { let Latency = 1; }
104 def V2Write_2cyc_1M    : SchedWriteRes<[V2UnitM]>   { let Latency = 2; }
105 def V2Write_3cyc_1M    : SchedWriteRes<[V2UnitM]>   { let Latency = 3; }
106 def V2Write_2cyc_1M0   : SchedWriteRes<[V2UnitM0]>  { let Latency = 2; }
107 def V2Write_3cyc_1M0   : SchedWriteRes<[V2UnitM0]>  { let Latency = 3; }
108 def V2Write_5cyc_1M0   : SchedWriteRes<[V2UnitM0]>  { let Latency = 5; }
109 def V2Write_12cyc_1M0  : SchedWriteRes<[V2UnitM0]>  { let Latency = 12;
110                                                       let ReleaseAtCycles = [12]; }
111 def V2Write_20cyc_1M0  : SchedWriteRes<[V2UnitM0]>  { let Latency = 20;
112                                                       let ReleaseAtCycles = [20]; }
113 def V2Write_4cyc_1L    : SchedWriteRes<[V2UnitL]>   { let Latency = 4; }
114 def V2Write_6cyc_1L    : SchedWriteRes<[V2UnitL]>   { let Latency = 6; }
115 def V2Write_2cyc_1V    : SchedWriteRes<[V2UnitV]>   { let Latency = 2; }
116 def V2Write_2cyc_1V0   : SchedWriteRes<[V2UnitV0]>  { let Latency = 2; }
117 def V2Write_2cyc_1V01  : SchedWriteRes<[V2UnitV01]> { let Latency = 2; }
118 def V2Write_2cyc_1V23  : SchedWriteRes<[V2UnitV23]> { let Latency = 2; }
119 def V2Write_3cyc_1V    : SchedWriteRes<[V2UnitV]>   { let Latency = 3; }
120 def V2Write_3cyc_1V01  : SchedWriteRes<[V2UnitV01]> { let Latency = 3;
121                                                       let ReleaseAtCycles = [2]; }
122 def V2Write_3cyc_1V23  : SchedWriteRes<[V2UnitV23]> { let Latency = 3; }
123 def V2Write_4cyc_1V    : SchedWriteRes<[V2UnitV]>   { let Latency = 4; }
124 def V2Write_5cyc_1V    : SchedWriteRes<[V2UnitV]>   { let Latency = 5; }
125 def V2Write_6cyc_1V    : SchedWriteRes<[V2UnitV]>   { let Latency = 6; }
126 def V2Write_12cyc_1V   : SchedWriteRes<[V2UnitV]>   { let Latency = 12; }
127 def V2Write_3cyc_1V0   : SchedWriteRes<[V2UnitV0]>  { let Latency = 3; }
128 def V2Write_3cyc_1V02  : SchedWriteRes<[V2UnitV02]> { let Latency = 3; }
129 def V2Write_4cyc_1V0   : SchedWriteRes<[V2UnitV0]>  { let Latency = 4; }
130 def V2Write_4cyc_1V02  : SchedWriteRes<[V2UnitV02]> { let Latency = 4; }
131 def V2Write_7cyc_1V0   : SchedWriteRes<[V2UnitV0]>  { let Latency = 7;
132                                                       let ReleaseAtCycles = [7]; }
133 def V2Write_7cyc_1V02  : SchedWriteRes<[V2UnitV02]> { let Latency = 7;
134                                                       let ReleaseAtCycles = [2]; }
135 def V2Write_9cyc_1V0   : SchedWriteRes<[V2UnitV0]>  { let Latency = 9; }
136 def V2Write_9cyc_1V02  : SchedWriteRes<[V2UnitV02]> { let Latency = 9;
137                                                       let ReleaseAtCycles = [2]; }
138 def V2Write_10cyc_1V0  : SchedWriteRes<[V2UnitV0]>  { let Latency = 10; }
139 def V2Write_10cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 10;
140                                                       let ReleaseAtCycles = [2]; }
141 def V2Write_12cyc_1V0  : SchedWriteRes<[V2UnitV0]>  { let Latency = 12;
142                                                       let ReleaseAtCycles = [11]; }
143 def V2Write_13cyc_1V0  : SchedWriteRes<[V2UnitV0]>  { let Latency = 13; }
144 def V2Write_15cyc_1V0  : SchedWriteRes<[V2UnitV0]>  { let Latency = 15; }
145 def V2Write_15cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 15;
146                                                       let ReleaseAtCycles = [8]; }
147 def V2Write_16cyc_1V0  : SchedWriteRes<[V2UnitV0]>  { let Latency = 16; }
148 def V2Write_16cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 16;
149                                                       let ReleaseAtCycles = [8]; }
150 def V2Write_20cyc_1V0  : SchedWriteRes<[V2UnitV0]>  { let Latency = 20;
151                                                       let ReleaseAtCycles = [20]; }
152 def V2Write_2cyc_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 2; }
153 def V2Write_2cyc_1V13  : SchedWriteRes<[V2UnitV13]> { let Latency = 2; }
154 def V2Write_3cyc_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 3; }
155 def V2Write_4cyc_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 4; }
156 def V2Write_4cyc_1V13  : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
157 def V2Write_6cyc_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 6; }
158 def V2Write_10cyc_1V1  : SchedWriteRes<[V2UnitV1]>  { let Latency = 10; }
159 def V2Write_6cyc_1L01  : SchedWriteRes<[V2UnitL01]> { let Latency = 6; }
161 //===----------------------------------------------------------------------===//
162 // Define generic 2 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>[_<count><unit>]. The resource
// list repeats each V2Unit<unit> <count> times, Latency matches the name and
// NumMicroOps is 2 for every record in this section.
164 def V2Write_1cyc_1B_1R : SchedWriteRes<[V2UnitB, V2UnitR]> {
165   let Latency     = 1;
166   let NumMicroOps = 2;
169 def V2Write_6cyc_1M0_1B : SchedWriteRes<[V2UnitM0, V2UnitB]> {
170   let Latency     = 6;
171   let NumMicroOps = 2;
174 def V2Write_9cyc_1M0_1L : SchedWriteRes<[V2UnitM0, V2UnitL]> {
175   let Latency     = 9;
176   let NumMicroOps = 2;
179 def V2Write_3cyc_1I_1M : SchedWriteRes<[V2UnitI, V2UnitM]> {
180   let Latency     = 3;
181   let NumMicroOps = 2;
184 def V2Write_1cyc_2M : SchedWriteRes<[V2UnitM, V2UnitM]> {
185   let Latency     = 1;
186   let NumMicroOps = 2;
189 def V2Write_3cyc_2M : SchedWriteRes<[V2UnitM, V2UnitM]> {
190   let Latency     = 3;
191   let NumMicroOps = 2;
194 def V2Write_4cyc_2M : SchedWriteRes<[V2UnitM, V2UnitM]> {
195   let Latency     = 4;
196   let NumMicroOps = 2;
199 def V2Write_5cyc_1L_1F : SchedWriteRes<[V2UnitL, V2UnitF]> {
200   let Latency     = 5;
201   let NumMicroOps = 2;
204 def V2Write_6cyc_1I_1L : SchedWriteRes<[V2UnitI, V2UnitL]> {
205   let Latency     = 6;
206   let NumMicroOps = 2;
209 def V2Write_7cyc_1F_1L : SchedWriteRes<[V2UnitF, V2UnitL]> {
210   let Latency     = 7;
211   let NumMicroOps = 2;
214 def V2Write_7cyc_1I_1L : SchedWriteRes<[V2UnitI, V2UnitL]> {
215   let Latency     = 7;
216   let NumMicroOps = 2;
219 def V2Write_1cyc_1L01_1D : SchedWriteRes<[V2UnitL01, V2UnitD]> {
220   let Latency     = 1;
221   let NumMicroOps = 2;
224 def V2Write_5cyc_1M0_1V : SchedWriteRes<[V2UnitM0, V2UnitV]> {
225   let Latency     = 5;
226   let NumMicroOps = 2;
229 def V2Write_2cyc_1L01_1V01 : SchedWriteRes<[V2UnitL01, V2UnitV01]> {
230   let Latency     = 2;
231   let NumMicroOps = 2;
234 def V2Write_2cyc_1L01_1V : SchedWriteRes<[V2UnitL01, V2UnitV]> {
235   let Latency     = 2;
236   let NumMicroOps = 2;
239 def V2Write_2cyc_2V01  : SchedWriteRes<[V2UnitV01, V2UnitV01]> {
240   let Latency = 2;
241   let NumMicroOps = 2;
244 def V2Write_4cyc_2V01  : SchedWriteRes<[V2UnitV01, V2UnitV01]> {
245   let Latency = 4;
246   let NumMicroOps = 2;
249 def V2Write_4cyc_1L01_1V01  : SchedWriteRes<[V2UnitL01, V2UnitV01]> {
250   let Latency = 4;
251   let NumMicroOps = 2;
254 def V2Write_4cyc_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> {
255   let Latency     = 4;
256   let NumMicroOps = 2;
259 def V2Write_4cyc_2V0 : SchedWriteRes<[V2UnitV0, V2UnitV0]> {
260   let Latency     = 4;
261   let NumMicroOps = 2;
264 def V2Write_4cyc_2V02 : SchedWriteRes<[V2UnitV02, V2UnitV02]> {
265   let Latency     = 4;
266   let NumMicroOps = 2;
269 def V2Write_4cyc_2V : SchedWriteRes<[V2UnitV, V2UnitV]> {
270   let Latency     = 4;
271   let NumMicroOps = 2;
274 def V2Write_6cyc_2V : SchedWriteRes<[V2UnitV, V2UnitV]> {
275   let Latency     = 6;
276   let NumMicroOps = 2;
279 def V2Write_6cyc_2L : SchedWriteRes<[V2UnitL, V2UnitL]> {
280   let Latency     = 6;
281   let NumMicroOps = 2;
284 def V2Write_8cyc_1L_1V : SchedWriteRes<[V2UnitL, V2UnitV]> {
285   let Latency     = 8;
286   let NumMicroOps = 2;
289 def V2Write_4cyc_1L01_1V : SchedWriteRes<[V2UnitL01, V2UnitV]> {
290   let Latency     = 4;
291   let NumMicroOps = 2;
294 def V2Write_3cyc_1M0_1M  : SchedWriteRes<[V2UnitM0, V2UnitM]> {
295   let Latency     = 3;
296   let NumMicroOps = 2;
299 def V2Write_4cyc_1M0_1M  : SchedWriteRes<[V2UnitM0, V2UnitM]> {
300   let Latency     = 4;
301   let NumMicroOps = 2;
304 def V2Write_1cyc_1M0_1M  : SchedWriteRes<[V2UnitM0, V2UnitM]> {
305   let Latency     = 1;
306   let NumMicroOps = 2;
309 def V2Write_2cyc_1M0_1M  : SchedWriteRes<[V2UnitM0, V2UnitM]> {
310   let Latency     = 2;
311   let NumMicroOps = 2;
314 def V2Write_6cyc_2V1 : SchedWriteRes<[V2UnitV1, V2UnitV1]> {
315   let Latency     = 6;
316   let NumMicroOps = 2;
319 def V2Write_4cyc_1V0_1M0 : SchedWriteRes<[V2UnitV0, V2UnitM0]> {
320   let Latency     = 4;
321   let NumMicroOps = 2;
324 def V2Write_5cyc_1V0_1M0 : SchedWriteRes<[V2UnitV0, V2UnitM0]> {
325   let Latency     = 5;
326   let NumMicroOps = 2;
329 def V2Write_5cyc_2V0 : SchedWriteRes<[V2UnitV0, V2UnitV0]> {
330   let Latency     = 5;
331   let NumMicroOps = 2;
334 def V2Write_5cyc_2V02 : SchedWriteRes<[V2UnitV02, V2UnitV02]> {
335   let Latency     = 5;
336   let NumMicroOps = 2;
339 def V2Write_6cyc_1V1_1M0 : SchedWriteRes<[V2UnitV1, V2UnitM0]> {
340   let Latency     = 6;
341   let NumMicroOps = 2;
344 def V2Write_7cyc_1M0_1V02 : SchedWriteRes<[V2UnitM0, V2UnitV02]> {
345   let Latency     = 7;
346   let NumMicroOps = 2;
349 def V2Write_2cyc_1V0_1M : SchedWriteRes<[V2UnitV0, V2UnitM]> {
350   let Latency     = 2;
351   let NumMicroOps = 2;
354 def V2Write_3cyc_1V0_1M : SchedWriteRes<[V2UnitV0, V2UnitM]> {
355   let Latency     = 3;
356   let NumMicroOps = 2;
359 def V2Write_6cyc_1V_1V13 : SchedWriteRes<[V2UnitV, V2UnitV13]> {
360   let Latency     = 6;
361   let NumMicroOps = 2;
364 def V2Write_6cyc_1L_1M : SchedWriteRes<[V2UnitL, V2UnitM]> {
365   let Latency     = 6;
366   let NumMicroOps = 2;
369 def V2Write_6cyc_1L_1S : SchedWriteRes<[V2UnitL, V2UnitS]> {
370   let Latency     = 6;
371   let NumMicroOps = 2;
374 def V2Write_4cyc_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> {
375   let Latency     = 4;
376   let NumMicroOps = 2;
379 def V2Write_8cyc_1M0_1V01 : SchedWriteRes<[V2UnitM0, V2UnitV01]> {
380   let Latency     = 8;
381   let NumMicroOps = 2;
384 //===----------------------------------------------------------------------===//
385 // Define generic 3 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... The per-unit counts in each
// name match the resource list and add up to NumMicroOps = 3.
387 def V2Write_1cyc_1L01_1D_1I : SchedWriteRes<[V2UnitL01, V2UnitD, V2UnitI]> {
388   let Latency     = 1;
389   let NumMicroOps = 3;
392 def V2Write_2cyc_1L01_1V01_1I : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitI]> {
393   let Latency     = 2;
394   let NumMicroOps = 3;
397 def V2Write_2cyc_1L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01]> {
398   let Latency     = 2;
399   let NumMicroOps = 3;
402 def V2Write_4cyc_1L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01]> {
403   let Latency     = 4;
404   let NumMicroOps = 3;
407 def V2Write_9cyc_1L_2V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV]> {
408   let Latency     = 9;
409   let NumMicroOps = 3;
412 def V2Write_4cyc_3V01  : SchedWriteRes<[V2UnitV01, V2UnitV01, V2UnitV01]> {
413   let Latency = 4;
414   let NumMicroOps = 3;
417 def V2Write_7cyc_1M_1M0_1V : SchedWriteRes<[V2UnitM, V2UnitM0, V2UnitV]> {
418   let Latency     = 7;
419   let NumMicroOps = 3;
422 def V2Write_2cyc_1L01_1S_1V : SchedWriteRes<[V2UnitL01, V2UnitS, V2UnitV]> {
423   let Latency     = 2;
424   let NumMicroOps = 3;
427 def V2Write_2cyc_1L01_1S_1V01 : SchedWriteRes<[V2UnitL01, V2UnitS, V2UnitV01]> {
428   let Latency     = 2;
429   let NumMicroOps = 3;
432 def V2Write_6cyc_3L : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL]> {
433   let Latency     = 6;
434   let NumMicroOps = 3;
437 def V2Write_6cyc_3V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV]> {
438   let Latency     = 6;
439   let NumMicroOps = 3;
442 def V2Write_8cyc_1L_2V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV]> {
443   let Latency     = 8;
444   let NumMicroOps = 3;
447 //===----------------------------------------------------------------------===//
448 // Define generic 4 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... The per-unit counts in each
// name match the resource list and add up to NumMicroOps = 4.
450 def V2Write_2cyc_1L01_2V01_1I : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01,
451                                                V2UnitI]> {
452   let Latency     = 2;
453   let NumMicroOps = 4;
456 def V2Write_2cyc_2L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitL01,
457                                             V2UnitV01, V2UnitV01]> {
458   let Latency     = 2;
459   let NumMicroOps = 4;
462 def V2Write_4cyc_2L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitL01,
463                                             V2UnitV01, V2UnitV01]> {
464   let Latency     = 4;
465   let NumMicroOps = 4;
468 def V2Write_5cyc_1I_3L : SchedWriteRes<[V2UnitI, V2UnitL, V2UnitL, V2UnitL]> {
469   let Latency     = 5;
470   let NumMicroOps = 4;
473 def V2Write_9cyc_2L_2V1 : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV1,
474                                          V2UnitV1]> {
475   let Latency     = 9;
476   let NumMicroOps = 4;
479 def V2Write_6cyc_4V0 : SchedWriteRes<[V2UnitV0, V2UnitV0, V2UnitV0, V2UnitV0]> {
480   let Latency     = 6;
481   let NumMicroOps = 4;
484 def V2Write_8cyc_4V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV, V2UnitV]> {
485   let Latency     = 8;
486   let NumMicroOps = 4;
489 def V2Write_6cyc_2V_2V13 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV13,
490                                           V2UnitV13]> {
491   let Latency     = 6;
492   let NumMicroOps = 4;
495 def V2Write_8cyc_2V_2V13 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV13,
496                                           V2UnitV13]> {
497   let Latency     = 8;
498   let NumMicroOps = 4;
501 def V2Write_6cyc_4V02 : SchedWriteRes<[V2UnitV02, V2UnitV02, V2UnitV02,
502                                        V2UnitV02]> {
503   let Latency     = 6;
504   let NumMicroOps = 4;
507 def V2Write_6cyc_4V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV, V2UnitV]> {
508   let Latency     = 6;
509   let NumMicroOps = 4;
512 def V2Write_8cyc_2L_2V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, V2UnitV]> {
513   let Latency     = 8;
514   let NumMicroOps = 4;
517 def V2Write_9cyc_2L_2V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, V2UnitV]> {
518   let Latency     = 9;
519   let NumMicroOps = 4;
522 def V2Write_2cyc_2L01_2V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV,
523                                           V2UnitV]> {
524   let Latency     = 2;
525   let NumMicroOps = 4;
528 def V2Write_4cyc_2L01_2V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV,
529                                           V2UnitV]> {
530   let Latency     = 4;
531   let NumMicroOps = 4;
534 def V2Write_8cyc_2M0_2V02 : SchedWriteRes<[V2UnitM0, V2UnitM0, V2UnitV02,
535                                           V2UnitV02]> {
536   let Latency     = 8;
537   let NumMicroOps = 4;
540 def V2Write_8cyc_2V_2V1 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV1,
541                                          V2UnitV1]> {
542   let Latency     = 8;
543   let NumMicroOps = 4;
546 def V2Write_4cyc_2M0_2M : SchedWriteRes<[V2UnitM0, V2UnitM0, V2UnitM,
547                                          V2UnitM]> {
548   let Latency     = 4;
549   let NumMicroOps = 4;
552 def V2Write_5cyc_2M0_2M : SchedWriteRes<[V2UnitM0, V2UnitM0, V2UnitM,
553                                          V2UnitM]> {
554   let Latency     = 5;
555   let NumMicroOps = 4;
558 def V2Write_6cyc_2I_2L : SchedWriteRes<[V2UnitI, V2UnitI, V2UnitL, V2UnitL]> {
559   let Latency     = 6;
560   let NumMicroOps = 4;
563 def V2Write_7cyc_4L : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, V2UnitL]> {
564   let Latency     = 7;
565   let NumMicroOps = 4;
568 def V2Write_6cyc_1L01_3V01 : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01,
569                                             V2UnitV01]> {
570   let Latency     = 6;
571   let NumMicroOps = 4;
574 //===----------------------------------------------------------------------===//
575 // Define generic 5 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... The per-unit counts in each
// name match the resource list and add up to NumMicroOps = 5.
577 def V2Write_2cyc_1L01_2V01_2I : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01,
578                                                V2UnitI, V2UnitI]> {
579   let Latency     = 2;
580   let NumMicroOps = 5;
583 def V2Write_8cyc_2L_3V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, V2UnitV,
584                                         V2UnitV]> {
585   let Latency     = 8;
586   let NumMicroOps = 5;
589 def V2Write_9cyc_1L_4V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV, V2UnitV,
590                                         V2UnitV]> {
591   let Latency     = 9;
592   let NumMicroOps = 5;
595 def V2Write_10cyc_1L_4V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV, V2UnitV,
596                                          V2UnitV]> {
597   let Latency     = 10;
598   let NumMicroOps = 5;
601 def V2Write_6cyc_5V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV, V2UnitV,
602                                      V2UnitV]> {
603   let Latency     = 6;
604   let NumMicroOps = 5;
607 //===----------------------------------------------------------------------===//
608 // Define generic 6 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... The per-unit counts in each
// name match the resource list and add up to NumMicroOps = 6.
610 def V2Write_8cyc_3L_3V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL,
611                                         V2UnitV, V2UnitV, V2UnitV]> {
612   let Latency     = 8;
613   let NumMicroOps = 6;
616 def V2Write_9cyc_3L_3V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL,
617                                         V2UnitV, V2UnitV, V2UnitV]> {
618   let Latency     = 9;
619   let NumMicroOps = 6;
622 def V2Write_9cyc_2L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV,
623                                         V2UnitV, V2UnitV, V2UnitV]> {
624   let Latency     = 9;
625   let NumMicroOps = 6;
628 def V2Write_9cyc_2L_2V_2S : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV,
629                                            V2UnitV, V2UnitS, V2UnitS]> {
630   let Latency     = 9;
631   let NumMicroOps = 6;
634 def V2Write_9cyc_2V_4V13 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV13,
635                                           V2UnitV13, V2UnitV13, V2UnitV13]> {
636   let Latency     = 9;
637   let NumMicroOps = 6;
640 def V2Write_2cyc_3L01_3V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
641                                           V2UnitV, V2UnitV, V2UnitV]> {
642   let Latency     = 2;
643   let NumMicroOps = 6;
646 def V2Write_4cyc_2L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV01,
647                                             V2UnitV01, V2UnitV01, V2UnitV01]> {
648   let Latency     = 4;
649   let NumMicroOps = 6;
652 def V2Write_5cyc_2L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV01,
653                                             V2UnitV01, V2UnitV01, V2UnitV01]> {
654   let Latency     = 5;
655   let NumMicroOps = 6;
658 def V2Write_2cyc_3L01_3V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
659                                             V2UnitV01, V2UnitV01, V2UnitV01]> {
660   let Latency     = 2;
661   let NumMicroOps = 6;
664 def V2Write_4cyc_2L01_2S_2V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitS,
665                                                V2UnitS, V2UnitV01, V2UnitV01]> {
666   let Latency     = 4;
667   let NumMicroOps = 6;
670 //===----------------------------------------------------------------------===//
671 // Define generic 7 micro-op types
// V2Write_8cyc_3L_4V: latency 8, three V2UnitL plus four V2UnitV entries,
// giving NumMicroOps = 7.
673 def V2Write_8cyc_3L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL,
674                                         V2UnitV, V2UnitV, V2UnitV, V2UnitV]> {
675   let Latency     = 8;
676   let NumMicroOps = 7;
679 //===----------------------------------------------------------------------===//
680 // Define generic 8 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... The per-unit counts in each
// name match the resource list and add up to NumMicroOps = 8.
682 def V2Write_2cyc_4L01_4V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
683                                           V2UnitL01, V2UnitV, V2UnitV, V2UnitV,
684                                           V2UnitV]> {
685   let Latency     = 2;
686   let NumMicroOps = 8;
689 def V2Write_2cyc_4L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
690                                             V2UnitL01, V2UnitV01, V2UnitV01,
691                                             V2UnitV01, V2UnitV01]> {
692   let Latency     = 2;
693   let NumMicroOps = 8;
696 def V2Write_4cyc_4L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
697                                             V2UnitL01, V2UnitV01, V2UnitV01,
698                                             V2UnitV01, V2UnitV01]> {
699   let Latency     = 4;
700   let NumMicroOps = 8;
703 def V2Write_6cyc_2L01_6V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV01,
704                                             V2UnitV01, V2UnitV01, V2UnitV01,
705                                             V2UnitV01, V2UnitV01]> {
706   let Latency     = 6;
707   let NumMicroOps = 8;
710 def V2Write_8cyc_4L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, V2UnitL,
711                                         V2UnitV, V2UnitV, V2UnitV, V2UnitV]> {
712   let Latency     = 8;
713   let NumMicroOps = 8;
716 //===----------------------------------------------------------------------===//
717 // Define generic 9 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... The per-unit counts in each
// name match the resource list and add up to NumMicroOps = 9.
719 def V2Write_6cyc_3L01_6V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
720                                             V2UnitV01, V2UnitV01, V2UnitV01,
721                                             V2UnitV01, V2UnitV01, V2UnitV01]> {
722   let Latency     = 6;
723   let NumMicroOps = 9;
726 def V2Write_10cyc_1L_8V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV, V2UnitV,
727                                          V2UnitV, V2UnitV, V2UnitV, V2UnitV,
728                                          V2UnitV]> {
729   let Latency     = 10;
730   let NumMicroOps = 9;
733 def V2Write_10cyc_3V_3L_3S : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV,
734                                             V2UnitL, V2UnitL, V2UnitL,
735                                             V2UnitS, V2UnitS, V2UnitS]> {
736   let Latency     = 10;
737   let NumMicroOps = 9;
740 //===----------------------------------------------------------------------===//
741 // Define generic 10 micro-op types
// V2Write_9cyc_6L_4V: latency 9, six V2UnitL plus four V2UnitV entries,
// giving NumMicroOps = 10.
743 def V2Write_9cyc_6L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, V2UnitL,
744                                         V2UnitL, V2UnitL, V2UnitV, V2UnitV,
745                                         V2UnitV, V2UnitV]> {
746   let Latency     = 9;
747   let NumMicroOps = 10;
750 //===----------------------------------------------------------------------===//
751 // Define generic 12 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... Each record here lists four
// load units and eight vector units, giving NumMicroOps = 12.
753 def V2Write_5cyc_4L01_8V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
754                                             V2UnitL01, V2UnitV01, V2UnitV01,
755                                             V2UnitV01, V2UnitV01, V2UnitV01,
756                                             V2UnitV01, V2UnitV01, V2UnitV01]> {
757   let Latency     = 5;
758   let NumMicroOps = 12;
761 def V2Write_9cyc_4L_8V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL,
762                                         V2UnitL, V2UnitV, V2UnitV,
763                                         V2UnitV, V2UnitV, V2UnitV,
764                                         V2UnitV, V2UnitV, V2UnitV]> {
765   let Latency     = 9;
766   let NumMicroOps = 12;
769 def V2Write_10cyc_4L_8V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL,
770                                          V2UnitL, V2UnitV, V2UnitV,
771                                          V2UnitV, V2UnitV, V2UnitV,
772                                          V2UnitV, V2UnitV, V2UnitV]> {
773   let Latency     = 10;
774   let NumMicroOps = 12;
777 //===----------------------------------------------------------------------===//
778 // Define generic 16 micro-op types
// Naming: V2Write_<latency>cyc_<count><unit>... The per-unit counts in each
// name match the resource list and add up to NumMicroOps = 16
// (4 + 12 and 4 + 8 + 4 respectively).
780 def V2Write_7cyc_4L01_12V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
781                                              V2UnitL01, V2UnitV01, V2UnitV01,
782                                              V2UnitV01, V2UnitV01, V2UnitV01,
783                                              V2UnitV01, V2UnitV01, V2UnitV01,
784                                              V2UnitV01, V2UnitV01, V2UnitV01,
785                                              V2UnitV01]> {
786   let Latency     = 7;
787   let NumMicroOps = 16;
790 def V2Write_10cyc_4L_8V_4S : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL,
791                                             V2UnitL, V2UnitV, V2UnitV,
792                                             V2UnitV, V2UnitV, V2UnitV,
793                                             V2UnitV, V2UnitV, V2UnitV,
794                                             V2UnitS, V2UnitS, V2UnitS,
795                                             V2UnitS]> {
796   let Latency     = 10;
797   let NumMicroOps = 16;
800 //===----------------------------------------------------------------------===//
801 // Define generic 18 micro-op types
// V2Write_7cyc_9L01_9V01: latency 7, nine V2UnitL01 plus nine V2UnitV01
// entries, giving NumMicroOps = 18.
803 def V2Write_7cyc_9L01_9V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
804                                             V2UnitL01, V2UnitL01, V2UnitL01,
805                                             V2UnitL01, V2UnitL01, V2UnitL01,
806                                             V2UnitV01, V2UnitV01, V2UnitV01,
807                                             V2UnitV01, V2UnitV01, V2UnitV01,
808                                             V2UnitV01, V2UnitV01, V2UnitV01]> {
809   let Latency     = 7;
810   let NumMicroOps = 18;
813 //===----------------------------------------------------------------------===//
814 // Define generic 27 micro-op types
// V2Write_7cyc_9L01_9S_9V01: latency 7, nine entries each of V2UnitL01,
// V2UnitS and V2UnitV01, giving NumMicroOps = 27.
816 def V2Write_7cyc_9L01_9S_9V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
817                                                V2UnitL01, V2UnitL01, V2UnitL01,
818                                                V2UnitL01, V2UnitL01, V2UnitL01,
819                                                V2UnitS, V2UnitS, V2UnitS,
820                                                V2UnitS, V2UnitS, V2UnitS,
821                                                V2UnitS, V2UnitS, V2UnitS,
822                                                V2UnitV01, V2UnitV01, V2UnitV01,
823                                                V2UnitV01, V2UnitV01, V2UnitV01,
824                                                V2UnitV01, V2UnitV01,
825                                                V2UnitV01]> {
826   let Latency     = 7;
827   let NumMicroOps = 27;
830 //===----------------------------------------------------------------------===//
831 // Define generic 36 micro-op types
// V2Write_11cyc_18L01_18V01: latency 11, eighteen V2UnitL01 plus eighteen
// V2UnitV01 entries, giving NumMicroOps = 36.
833 def V2Write_11cyc_18L01_18V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01,
834                                                V2UnitL01, V2UnitL01, V2UnitL01,
835                                                V2UnitL01, V2UnitL01, V2UnitL01,
836                                                V2UnitL01, V2UnitL01, V2UnitL01,
837                                                V2UnitL01, V2UnitL01, V2UnitL01,
838                                                V2UnitL01, V2UnitL01, V2UnitL01,
839                                                V2UnitV01, V2UnitV01, V2UnitV01,
840                                                V2UnitV01, V2UnitV01, V2UnitV01,
841                                                V2UnitV01, V2UnitV01, V2UnitV01,
842                                                V2UnitV01, V2UnitV01, V2UnitV01,
843                                                V2UnitV01, V2UnitV01, V2UnitV01,
844                                                V2UnitV01, V2UnitV01,
845                                                V2UnitV01]> {
846   let Latency     = 11;
847   let NumMicroOps = 36;
850 //===----------------------------------------------------------------------===//
851 // Define generic 54 micro-op types
// V2Write_11cyc_18L01_18S_18V01: 11-cycle latency, 54 micro-ops. The resource
// list names V2UnitL01, V2UnitS and V2UnitV01 eighteen times each
// (18 + 18 + 18 = 54).
853 def V2Write_11cyc_18L01_18S_18V01 : SchedWriteRes<[V2UnitL01, V2UnitL01,
854                                                    V2UnitL01, V2UnitL01,
855                                                    V2UnitL01, V2UnitL01,
856                                                    V2UnitL01, V2UnitL01,
857                                                    V2UnitL01, V2UnitL01,
858                                                    V2UnitL01, V2UnitL01,
859                                                    V2UnitL01, V2UnitL01,
860                                                    V2UnitL01, V2UnitL01,
861                                                    V2UnitL01, V2UnitL01,
862                                                    V2UnitS, V2UnitS, V2UnitS,
863                                                    V2UnitS, V2UnitS, V2UnitS,
864                                                    V2UnitS, V2UnitS, V2UnitS,
865                                                    V2UnitS, V2UnitS, V2UnitS,
866                                                    V2UnitS, V2UnitS, V2UnitS,
867                                                    V2UnitS, V2UnitS, V2UnitS,
868                                                    V2UnitV01, V2UnitV01,
869                                                    V2UnitV01, V2UnitV01,
870                                                    V2UnitV01, V2UnitV01,
871                                                    V2UnitV01, V2UnitV01,
872                                                    V2UnitV01, V2UnitV01,
873                                                    V2UnitV01, V2UnitV01,
874                                                    V2UnitV01, V2UnitV01,
875                                                    V2UnitV01, V2UnitV01,
876                                                    V2UnitV01, V2UnitV01]> {
877   let Latency     = 11;
878   let NumMicroOps = 54;
881 //===----------------------------------------------------------------------===//
882 // Define predicate-controlled types
// Each SchedWriteVariant below lists two SchedVar alternatives: one guarded by
// a predicate and a final one guarded by NoSchedPred. Per LLVM's
// TargetSchedule.td the NoSchedPred entry acts as the default when the guarded
// entry does not apply. The predicates themselves are defined outside this
// section.

// Arithmetic / logical / extract variants: the guarded entry selects a 1-cycle
// write; the default selects a longer one.
884 def V2Write_ArithI : SchedWriteVariant<[
885                        SchedVar<IsCheapLSL,  [V2Write_1cyc_1I]>,
886                        SchedVar<NoSchedPred, [V2Write_2cyc_1M]>]>;
888 def V2Write_ArithF : SchedWriteVariant<[
889                        SchedVar<IsCheapLSL,  [V2Write_1cyc_1F]>,
890                        SchedVar<NoSchedPred, [V2Write_2cyc_1M]>]>;
892 def V2Write_Logical : SchedWriteVariant<[
893                         SchedVar<NeoverseNoLSL, [V2Write_1cyc_1F]>,
894                         SchedVar<NoSchedPred,   [V2Write_2cyc_1M]>]>;
896 def V2Write_Extr : SchedWriteVariant<[
897                      SchedVar<IsRORImmIdiomPred, [V2Write_1cyc_1I]>,
898                      SchedVar<NoSchedPred,       [V2Write_3cyc_1I_1M]>]>;
// Vector load/store with register offset: NeoverseHQForm selects the write
// that additionally names an I unit.
900 def V2Write_LdrHQ : SchedWriteVariant<[
901                       SchedVar<NeoverseHQForm,  [V2Write_7cyc_1I_1L]>,
902                       SchedVar<NoSchedPred,     [V2Write_6cyc_1L]>]>;
904 def V2Write_StrHQ : SchedWriteVariant<[
905                       SchedVar<NeoverseHQForm,  [V2Write_2cyc_1L01_1V01_1I]>,
906                       SchedVar<NoSchedPred,     [V2Write_2cyc_1L01_1V01]>]>;
// "0orNcyc" variants: NeoverseZeroMove selects V2Write_0cyc; otherwise the
// N-cycle write named in the suffix is used.
908 def V2Write_0or1cyc_1I : SchedWriteVariant<[
909                       SchedVar<NeoverseZeroMove, [V2Write_0cyc]>,
910                       SchedVar<NoSchedPred,      [V2Write_1cyc_1I]>]>;
912 def V2Write_0or2cyc_1V : SchedWriteVariant<[
913                       SchedVar<NeoverseZeroMove, [V2Write_0cyc]>,
914                       SchedVar<NoSchedPred,      [V2Write_2cyc_1V]>]>;
916 def V2Write_0or3cyc_1M0 : SchedWriteVariant<[
917                       SchedVar<NeoverseZeroMove, [V2Write_0cyc]>,
918                       SchedVar<NoSchedPred,      [V2Write_3cyc_1M0]>]>;
// "XorYcyc" variants keyed on NeoversePdIsPg: the guarded entry selects the
// Y-cycle (one cycle longer) write, the default selects the X-cycle write on
// the same units.
920 def V2Write_2or3cyc_1M : SchedWriteVariant<[
921                       SchedVar<NeoversePdIsPg,  [V2Write_3cyc_1M]>,
922                       SchedVar<NoSchedPred,     [V2Write_2cyc_1M]>]>;
924 def V2Write_3or4cyc_2M : SchedWriteVariant<[
925                       SchedVar<NeoversePdIsPg,  [V2Write_4cyc_2M]>,
926                       SchedVar<NoSchedPred,     [V2Write_3cyc_2M]>]>;
928 def V2Write_1or2cyc_1M0 : SchedWriteVariant<[
929                       SchedVar<NeoversePdIsPg,  [V2Write_2cyc_1M0]>,
930                       SchedVar<NoSchedPred,     [V2Write_1cyc_1M0]>]>;
932 def V2Write_2or3cyc_1M0 : SchedWriteVariant<[
933                       SchedVar<NeoversePdIsPg,  [V2Write_3cyc_1M0]>,
934                       SchedVar<NoSchedPred,     [V2Write_2cyc_1M0]>]>;
936 def V2Write_1or2cyc_1M0_1M : SchedWriteVariant<[
937                       SchedVar<NeoversePdIsPg,  [V2Write_2cyc_1M0_1M]>,
938                       SchedVar<NoSchedPred,     [V2Write_1cyc_1M0_1M]>]>;
940 def V2Write_3or4cyc_1M0_1M : SchedWriteVariant<[
941                       SchedVar<NeoversePdIsPg,  [V2Write_4cyc_1M0_1M]>,
942                       SchedVar<NoSchedPred,     [V2Write_3cyc_1M0_1M]>]>;
944 def V2Write_4or5cyc_2M0_2M : SchedWriteVariant<[
945                       SchedVar<NeoversePdIsPg,  [V2Write_5cyc_2M0_2M]>,
946                       SchedVar<NoSchedPred,     [V2Write_4cyc_2M0_2M]>]>;
948 def V2Write_4or5cyc_1V0_1M0 : SchedWriteVariant<[
949                       SchedVar<NeoversePdIsPg,  [V2Write_5cyc_1V0_1M0]>,
950                       SchedVar<NoSchedPred,     [V2Write_4cyc_1V0_1M0]>]>;
952 def V2Write_2or3cyc_1V0_1M : SchedWriteVariant<[
953                       SchedVar<NeoversePdIsPg,  [V2Write_3cyc_1V0_1M]>,
954                       SchedVar<NoSchedPred,     [V2Write_2cyc_1V0_1M]>]>;
// Increment/decrement: NeoverseCheapIncDec selects the 1-cycle write,
// otherwise the 2-cycle write is used.
956 def V2Write_IncDec : SchedWriteVariant<[
957                       SchedVar<NeoverseCheapIncDec, [V2Write_1cyc_1F]>,
958                       SchedVar<NoSchedPred,         [V2Write_2cyc_1M]>]>;
960 //===----------------------------------------------------------------------===//
961 // Define forwarded types
// Each V2Wr_* record is a write type (unit list plus Latency). Each V2Rd_*
// record is a SchedReadAdvance<N, [writes]> naming the write(s) it pairs with.
// Per LLVM's TargetSchedule.td, a read advance of N reduces the latency seen
// by that operand from the listed writes by N cycles, which is how
// accumulator forwarding is modelled here.
963 // NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for
964 // consumers of 64 bit multiply high operations?
// Integer multiply: V2Wr_IMUL selects V2Wr_IM when IsReg3ZeroPred holds and
// V2Wr_IMA otherwise; only V2Wr_IMA is listed in the V2Rd_IMA read advance.
965 def V2Wr_IM   : SchedWriteRes<[V2UnitM]>  { let Latency = 2; }
966 def V2Wr_IMA  : SchedWriteRes<[V2UnitM0]> { let Latency = 2; }
967 def V2Wr_IMUL : SchedWriteVariant<[
968                   SchedVar<IsReg3ZeroPred, [V2Wr_IM]>,
969                   SchedVar<NoSchedPred,    [V2Wr_IMA]>]>;
970 def V2Rd_IMA  : SchedReadAdvance<1, [V2Wr_IMA]>;
// Scalar FP multiply-accumulate: the read advance also lists the generic
// WriteFMul write, not only V2Wr_FMA.
972 def V2Wr_FMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; }
973 def V2Rd_FMA : SchedReadAdvance<2, [WriteFMul, V2Wr_FMA]>;
// V2Wr_V* / V2Rd_V* pairs (used by the ASIMD entries later in this file).
975 def V2Wr_VA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
976 def V2Rd_VA : SchedReadAdvance<3, [V2Wr_VA]>;
978 def V2Wr_VDOT : SchedWriteRes<[V2UnitV]> { let Latency = 3; }
979 def V2Rd_VDOT : SchedReadAdvance<2, [V2Wr_VDOT]>;
981 def V2Wr_VMMA : SchedWriteRes<[V2UnitV]> { let Latency = 3; }
982 def V2Rd_VMMA : SchedReadAdvance<2, [V2Wr_VMMA]>;
984 def V2Wr_VMA : SchedWriteRes<[V2UnitV02]> { let Latency = 4; }
985 def V2Rd_VMA : SchedReadAdvance<3, [V2Wr_VMA]>;
987 def V2Wr_VMAH : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 4; }
988 def V2Rd_VMAH : SchedReadAdvance<2, [V2Wr_VMAH]>;
990 def V2Wr_VMAL : SchedWriteRes<[V2UnitV02]> { let Latency = 4; }
991 def V2Rd_VMAL : SchedReadAdvance<3, [V2Wr_VMAL]>;
993 def V2Wr_VPA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
994 def V2Rd_VPA : SchedReadAdvance<3, [V2Wr_VPA]>;
996 def V2Wr_VSA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
997 def V2Rd_VSA : SchedReadAdvance<3, [V2Wr_VSA]>;
999 def V2Wr_VFCMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; }
1000 def V2Rd_VFCMA : SchedReadAdvance<2, [V2Wr_VFCMA]>;
// V2Rd_VFMA covers two writes: V2Wr_VFM (latency 3) and V2Wr_VFMA (latency 4).
1002 def V2Wr_VFM  : SchedWriteRes<[V2UnitV]> { let Latency = 3; }
1003 def V2Wr_VFMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; }
1004 def V2Rd_VFMA : SchedReadAdvance<2, [V2Wr_VFM, V2Wr_VFMA]>;
1006 def V2Wr_VFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 4; }
1007 def V2Rd_VFMAL : SchedReadAdvance<2, [V2Wr_VFMAL]>;
1009 def V2Wr_VBFDOT : SchedWriteRes<[V2UnitV]> { let Latency = 5; }
1010 def V2Rd_VBFDOT : SchedReadAdvance<2, [V2Wr_VBFDOT]>;
1011 def V2Wr_VBFMMA : SchedWriteRes<[V2UnitV]> { let Latency = 6; }
1012 def V2Rd_VBFMMA : SchedReadAdvance<2, [V2Wr_VBFMMA]>;
1013 def V2Wr_VBFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 5; }
1014 def V2Rd_VBFMAL : SchedReadAdvance<3, [V2Wr_VBFMAL]>;
1016 def V2Wr_CRC : SchedWriteRes<[V2UnitM0]> { let Latency = 2; }
1017 def V2Rd_CRC : SchedReadAdvance<1, [V2Wr_CRC]>;
// V2Wr_Z* / V2Rd_Z* pairs (presumably the SVE counterparts of the V2Wr_V*
// pairs above; their users are outside this section -- TODO confirm).
1019 def V2Wr_ZA  : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
1020 def V2Rd_ZA  : SchedReadAdvance<3, [V2Wr_ZA]>;
1021 def V2Wr_ZPA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
1022 def V2Rd_ZPA : SchedReadAdvance<3, [V2Wr_ZPA]>;
1023 def V2Wr_ZSA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
1024 def V2Rd_ZSA : SchedReadAdvance<3, [V2Wr_ZSA]>;
1026 def V2Wr_ZDOTB : SchedWriteRes<[V2UnitV]>   { let Latency = 3; }
1027 def V2Rd_ZDOTB : SchedReadAdvance<2, [V2Wr_ZDOTB]>;
1028 def V2Wr_ZDOTH : SchedWriteRes<[V2UnitV02]> { let Latency = 4; }
1029 def V2Rd_ZDOTH : SchedReadAdvance<3, [V2Wr_ZDOTH]>;
1031 // NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
1032 // throughput to 1 in case of forwarding?
1033 def V2Wr_ZCMABHS : SchedWriteRes<[V2UnitV02]> { let Latency = 4; }
1034 def V2Rd_ZCMABHS : SchedReadAdvance<3, [V2Wr_ZCMABHS]>;
1035 def V2Wr_ZCMAD   : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 5; }
1036 def V2Rd_ZCMAD   : SchedReadAdvance<2, [V2Wr_ZCMAD]>;
1038 def V2Wr_ZMMA : SchedWriteRes<[V2UnitV]> { let Latency = 3; }
1039 def V2Rd_ZMMA : SchedReadAdvance<2, [V2Wr_ZMMA]>;
1041 def V2Wr_ZMABHS : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 4; }
1042 def V2Rd_ZMABHS : SchedReadAdvance<3, [V2Wr_ZMABHS]>;
1043 def V2Wr_ZMAD  : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 5; }
1044 def V2Rd_ZMAD  : SchedReadAdvance<2, [V2Wr_ZMAD]>;
1046 def V2Wr_ZMAL : SchedWriteRes<[V2UnitV02]> { let Latency = 4; }
1047 def V2Rd_ZMAL : SchedReadAdvance<3, [V2Wr_ZMAL]>;
// A single read advance, V2Rd_ZMASQ, is shared by the three V2Wr_ZMASQ* writes.
1049 def V2Wr_ZMASQL   : SchedWriteRes<[V2UnitV02]>            { let Latency = 4; }
1050 def V2Wr_ZMASQBHS : SchedWriteRes<[V2UnitV02]>            { let Latency = 4; }
1051 def V2Wr_ZMASQD   : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 5; }
1052 def V2Rd_ZMASQ    : SchedReadAdvance<2, [V2Wr_ZMASQL, V2Wr_ZMASQBHS,
1053                                          V2Wr_ZMASQD]>;
1055 def V2Wr_ZFCMA : SchedWriteRes<[V2UnitV]> { let Latency = 5; }
1056 def V2Rd_ZFCMA : SchedReadAdvance<3, [V2Wr_ZFCMA]>;
1058 def V2Wr_ZFMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; }
1059 def V2Rd_ZFMA : SchedReadAdvance<2, [V2Wr_ZFMA]>;
1061 def V2Wr_ZFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 4; }
1062 def V2Rd_ZFMAL : SchedReadAdvance<2, [V2Wr_ZFMAL]>;
1064 def V2Wr_ZBFDOT : SchedWriteRes<[V2UnitV]> { let Latency = 5; }
1065 def V2Rd_ZBFDOT : SchedReadAdvance<2, [V2Wr_ZBFDOT]>;
1066 def V2Wr_ZBFMMA : SchedWriteRes<[V2UnitV]> { let Latency = 6; }
1067 def V2Rd_ZBFMMA : SchedReadAdvance<2, [V2Wr_ZBFMMA]>;
1068 def V2Wr_ZBFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 5; }
1069 def V2Rd_ZBFMAL : SchedReadAdvance<3, [V2Wr_ZBFMAL]>;
1071 //===----------------------------------------------------------------------===//
1072 // Define types with long resource cycles (rc)
// Naming: V2Write_<L>cyc_1<unit>_<R>rc sets Latency = L on a single <unit>
// resource and ReleaseAtCycles = [R]. Per LLVM's TargetSchedule.td,
// ReleaseAtCycles gives the cycle at which the resource is released, so these
// writes keep their unit busy for R cycles rather than the default of one.
1074 def V2Write_6cyc_1V1_5rc    : SchedWriteRes<[V2UnitV1]>  { let Latency =  6; let ReleaseAtCycles = [ 5]; }
1075 def V2Write_7cyc_1V02_7rc   : SchedWriteRes<[V2UnitV02]> { let Latency =  7; let ReleaseAtCycles = [ 7]; }
1076 def V2Write_10cyc_1V02_5rc  : SchedWriteRes<[V2UnitV02]> { let Latency = 10; let ReleaseAtCycles = [ 5]; }
1077 def V2Write_10cyc_1V02_9rc  : SchedWriteRes<[V2UnitV02]> { let Latency = 10; let ReleaseAtCycles = [ 9]; }
1078 def V2Write_10cyc_1V02_10rc : SchedWriteRes<[V2UnitV02]> { let Latency = 10; let ReleaseAtCycles = [10]; }
1079 def V2Write_10cyc_1V0_9rc   : SchedWriteRes<[V2UnitV0]>  { let Latency = 10; let ReleaseAtCycles = [ 9]; }
1080 def V2Write_10cyc_1V1_9rc   : SchedWriteRes<[V2UnitV1]>  { let Latency = 10; let ReleaseAtCycles = [ 9]; }
1081 def V2Write_13cyc_1V0_12rc  : SchedWriteRes<[V2UnitV0]>  { let Latency = 13; let ReleaseAtCycles = [12]; }
1082 def V2Write_13cyc_1V02_12rc : SchedWriteRes<[V2UnitV02]> { let Latency = 13; let ReleaseAtCycles = [12]; }
1083 def V2Write_13cyc_1V02_13rc : SchedWriteRes<[V2UnitV02]> { let Latency = 13; let ReleaseAtCycles = [13]; }
1084 def V2Write_15cyc_1V02_14rc : SchedWriteRes<[V2UnitV02]> { let Latency = 15; let ReleaseAtCycles = [14]; }
1085 def V2Write_16cyc_1V02_15rc : SchedWriteRes<[V2UnitV02]> { let Latency = 16; let ReleaseAtCycles = [15]; }
1086 def V2Write_16cyc_1V0_14rc  : SchedWriteRes<[V2UnitV0]>  { let Latency = 16; let ReleaseAtCycles = [14]; }
1088 // Miscellaneous
1089 // -----------------------------------------------------------------------------
1091 def : InstRW<[WriteI], (instrs COPY)>;
1093 // §3.3 Branch instructions
1094 // -----------------------------------------------------------------------------
1096 // Branch, immed
1097 // Compare and branch
1098 def : SchedAlias<WriteBr,    V2Write_1cyc_1B>;
1100 // Branch, register
1101 def : SchedAlias<WriteBrReg, V2Write_1cyc_1B>;
1103 // Branch and link, immed
1104 // Branch and link, register
1105 def : InstRW<[V2Write_1cyc_1B_1R], (instrs BL, BLR)>;
1107 // §3.4 Arithmetic and Logical Instructions
1108 // -----------------------------------------------------------------------------
1110 // ALU, basic
1111 // ALU, basic, flagset
1112 def : SchedAlias<WriteI, V2Write_1cyc_1I>;
1113 def : InstRW<[V2Write_1cyc_1F], (instregex "^(ADC|SBC)S[WX]r$")>;
1114 def : InstRW<[V2Write_0or1cyc_1I], (instregex "^MOVZ[WX]i$")>;
1116 // ALU, extend and shift
1117 def : SchedAlias<WriteIEReg, V2Write_2cyc_1M>;
1119 // Arithmetic, LSL shift, shift <= 4
1120 // Arithmetic, flagset, LSL shift, shift <= 4
1121 // Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
1122 def : SchedAlias<WriteISReg, V2Write_ArithI>;
1123 def : InstRW<[V2Write_ArithF],
1124              (instregex "^(ADD|SUB)S[WX]rs$")>;
1126 // Arithmetic, immediate to logical address tag
1127 def : InstRW<[V2Write_2cyc_1M], (instrs ADDG, SUBG)>;
1129 // Convert floating-point condition flags
1130 // Flag manipulation instructions
1131 def : WriteRes<WriteSys, []> { let Latency = 1; }
1133 // Insert Random Tags
1134 def : InstRW<[V2Write_2cyc_1M], (instrs IRG, IRGstack)>;
1136 // Insert Tag Mask
1137 // Subtract Pointer
1138 // Subtract Pointer, flagset
1139 def : InstRW<[V2Write_1cyc_1I], (instrs GMI, SUBP, SUBPS)>;
1141 // Logical, shift, no flagset
1142 def : InstRW<[V2Write_1cyc_1I],    (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>;
1143 def : InstRW<[V2Write_0or1cyc_1I], (instregex "^ORR[WX]rs$")>;
1145 // Logical, shift, flagset
1146 def : InstRW<[V2Write_Logical], (instregex "^(AND|BIC)S[WX]rs$")>;
1148 // Move and shift instructions
1149 // -----------------------------------------------------------------------------
1151 def : SchedAlias<WriteImm, V2Write_1cyc_1I>;
1153 // §3.5 Divide and multiply instructions
1154 // -----------------------------------------------------------------------------
1156 // SDIV, UDIV
1157 def : SchedAlias<WriteID32,  V2Write_12cyc_1M0>;
1158 def : SchedAlias<WriteID64,  V2Write_20cyc_1M0>;
// Generic integer multiply classes: both widths are aliased to
// V2Write_2cyc_1M. The InstRW entries below override this for the
// instructions they match.
1160 def : SchedAlias<WriteIM32, V2Write_2cyc_1M>;
1161 def : SchedAlias<WriteIM64, V2Write_2cyc_1M>;
1163 // Multiply
1164 // Multiply accumulate, W-form
1165 // Multiply accumulate, X-form
// The list is one write (V2Wr_IMUL) followed by three reads; the last read
// is V2Rd_IMA, the read advance declared against V2Wr_IMA.
// NOTE(review): presumably the third read is the accumulator operand --
// confirm against the instruction operand order.
1166 def : InstRW<[V2Wr_IMUL, ReadIM, ReadIM, V2Rd_IMA],
1167              (instregex "^M(ADD|SUB)[WX]rrr$")>;
1169 // Multiply accumulate long
1170 // Multiply long
1171 def : InstRW<[V2Wr_IMUL, ReadIM, ReadIM, V2Rd_IMA],
1172              (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
1174 // Multiply high
1175 def : InstRW<[V2Write_3cyc_1M], (instrs SMULHrr, UMULHrr)>;
1177 // Pointer Authentication Instructions (v8.3 PAC)
1178 // -----------------------------------------------------------------------------
1180 // Authenticate data address
1181 // Authenticate instruction address
1182 // Compute pointer authentication code for data address
1183 // Compute pointer authentication code, using generic key
1184 // Compute pointer authentication code for instruction address
1185 def : InstRW<[V2Write_5cyc_1M0], (instregex "^AUT", "^PAC")>;
1187 // Branch and link, register, with pointer authentication
1188 // Branch, register, with pointer authentication
1189 // Branch, return, with pointer authentication
1190 def : InstRW<[V2Write_6cyc_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
1191                                             BRAAZ, BRAB, BRABZ, RETAA, RETAB,
1192                                             ERETAA, ERETAB)>;
1195 // Load register, with pointer authentication
1196 def : InstRW<[V2Write_9cyc_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>;
1198 // Strip pointer authentication code
1199 def : InstRW<[V2Write_2cyc_1M0], (instrs XPACD, XPACI, XPACLRI)>;
1201 // Miscellaneous data-processing instructions
1202 // -----------------------------------------------------------------------------
1204 // Address generation
1205 def : InstRW<[V2Write_1cyc_1F], (instrs ADR, ADRP)>;
1207 // Bitfield extract, one reg
1208 // Bitfield extract, two regs
1209 def : SchedAlias<WriteExtr, V2Write_Extr>;
1210 def : InstRW<[V2Write_Extr], (instrs EXTRWrri, EXTRXrri)>;
1212 // Bitfield move, basic
1213 def : SchedAlias<WriteIS, V2Write_1cyc_1I>;
1215 // Bitfield move, insert
1216 def : InstRW<[V2Write_2cyc_1M], (instregex "^BFM[WX]ri$")>;
1218 // Load instructions
1219 // -----------------------------------------------------------------------------
1221 // NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3.
1223 def : SchedAlias<WriteLD,    V2Write_4cyc_1L>;
1224 def : SchedAlias<WriteLDIdx, V2Write_4cyc_1L>;
1226 // Load register, literal
1227 def : InstRW<[V2Write_5cyc_1L_1F], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>;
1229 // Load pair, signed immed offset, signed words
1230 def : InstRW<[V2Write_5cyc_1I_3L, WriteLDHi], (instrs LDPSWi)>;
1232 // Load pair, immed post-index or immed pre-index, signed words
1233 def : InstRW<[WriteAdr, V2Write_5cyc_1I_3L, WriteLDHi],
1234              (instregex "^LDPSW(post|pre)$")>;
1236 // Store instructions
1237 // -----------------------------------------------------------------------------
1239 // NOTE: SOG, p. 20: Unsure if STRH uses pipeline I.
1241 def : SchedAlias<WriteST,    V2Write_1cyc_1L01_1D>;
1242 def : SchedAlias<WriteSTIdx, V2Write_1cyc_1L01_1D>;
1243 def : SchedAlias<WriteSTP,   V2Write_1cyc_1L01_1D>;
1244 def : SchedAlias<WriteAdr,   V2Write_1cyc_1I>;
1246 // Tag load instructions
1247 // -----------------------------------------------------------------------------
1249 // Load allocation tag
1250 // Load multiple allocation tags
1251 def : InstRW<[V2Write_4cyc_1L], (instrs LDG, LDGM)>;
1253 // Tag store instructions
1254 // -----------------------------------------------------------------------------
1256 // Store allocation tags to one or two granules, post-index
1257 // Store allocation tags to one or two granules, pre-index
1258 // Store allocation tag to one or two granules, zeroing, post-index
1259 // Store Allocation Tag to one or two granules, zeroing, pre-index
1260 // Store allocation tag and reg pair to memory, post-Index
1261 // Store allocation tag and reg pair to memory, pre-Index
1262 def : InstRW<[V2Write_1cyc_1L01_1D_1I], (instrs STGPreIndex, STGPostIndex,
1263                                                 ST2GPreIndex, ST2GPostIndex,
1264                                                 STZGPreIndex, STZGPostIndex,
1265                                                 STZ2GPreIndex, STZ2GPostIndex,
1266                                                 STGPpre, STGPpost)>;
1268 // Store allocation tags to one or two granules, signed offset
1269 // Store allocation tag to two granules, zeroing, signed offset
1270 // Store allocation tag and reg pair to memory, signed offset
1271 // Store multiple allocation tags
1272 def : InstRW<[V2Write_1cyc_1L01_1D], (instrs STGi, ST2Gi, STZGi,
1273                                              STZ2Gi, STGPi, STGM, STZGM)>;
1275 // FP data processing instructions
1276 // -----------------------------------------------------------------------------
1278 // FP absolute value
1279 // FP arithmetic
1280 // FP min/max
1281 // FP negate
1282 // FP select
1283 def : SchedAlias<WriteF,     V2Write_2cyc_1V>;
1285 // FP compare
1286 def : SchedAlias<WriteFCmp,  V2Write_2cyc_1V0>;
1288 // FP divide, square root
1289 def : SchedAlias<WriteFDiv,  V2Write_7cyc_1V02>;
1291 // FP divide, H-form
1292 def : InstRW<[V2Write_7cyc_1V02],  (instrs FDIVHrr)>;
1293 // FP divide, S-form
1294 def : InstRW<[V2Write_10cyc_1V02], (instrs FDIVSrr)>;
1295 // FP divide, D-form
1296 def : InstRW<[V2Write_15cyc_1V02], (instrs FDIVDrr)>;
1298 // FP square root, H-form
1299 def : InstRW<[V2Write_7cyc_1V02],  (instrs FSQRTHr)>;
1300 // FP square root, S-form
1301 def : InstRW<[V2Write_9cyc_1V02],  (instrs FSQRTSr)>;
1302 // FP square root, D-form
1303 def : InstRW<[V2Write_16cyc_1V02], (instrs FSQRTDr)>;
1305 // FP multiply
1306 def : WriteRes<WriteFMul, [V2UnitV]> { let Latency = 3; }
1308 // FP multiply accumulate
1309 def : InstRW<[V2Wr_FMA, ReadDefault, ReadDefault, V2Rd_FMA],
1310              (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
1312 // FP round to integral
1313 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
1314                                              "^FRINT(32|64)[XZ][SD]r$")>;
1316 // FP miscellaneous instructions
1317 // -----------------------------------------------------------------------------
1319 // FP convert, from gen to vec reg
1320 def : InstRW<[V2Write_3cyc_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
1322 // FP convert, from vec to gen reg
1323 def : InstRW<[V2Write_3cyc_1V01],
1324              (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>;
1326 // FP convert, Javascript from vec to gen reg
1327 def : SchedAlias<WriteFCvt, V2Write_3cyc_1V0>;
1329 // FP convert, from vec to vec reg
1330 def : InstRW<[V2Write_3cyc_1V02], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr,
1331                                           FCVTHDr, FCVTSDr, FCVTXNv1i64)>;
1333 // FP move, immed
1334 // FP move, register
1335 def : SchedAlias<WriteFImm, V2Write_2cyc_1V>;
1337 // FP transfer, from gen to low half of vec reg
1338 def : InstRW<[V2Write_0or3cyc_1M0],
1339              (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>;
1341 // FP transfer, from gen to high half of vec reg
1342 def : InstRW<[V2Write_5cyc_1M0_1V], (instrs FMOVXDHighr)>;
1344 // FP transfer, from vec to gen reg
1345 def : SchedAlias<WriteFCopy, V2Write_2cyc_2V01>;
1347 // FP load instructions
1348 // -----------------------------------------------------------------------------
1350 // Load vector reg, literal, S/D/Q forms
1351 def : InstRW<[V2Write_7cyc_1F_1L], (instregex "^LDR[SDQ]l$")>;
1353 // Load vector reg, unscaled immed
1354 def : InstRW<[V2Write_6cyc_1L], (instregex "^LDUR[BHSDQ]i$")>;
1356 // Load vector reg, immed post-index
1357 // Load vector reg, immed pre-index
1358 def : InstRW<[WriteAdr, V2Write_6cyc_1I_1L],
1359              (instregex "^LDR[BHSDQ](pre|post)$")>;
1361 // Load vector reg, unsigned immed
1362 def : InstRW<[V2Write_6cyc_1L], (instregex "^LDR[BHSDQ]ui$")>;
1364 // Load vector reg, register offset, basic
1365 // Load vector reg, register offset, scale, S/D-form
1366 // Load vector reg, register offset, scale, H/Q-form
1367 // Load vector reg, register offset, extend
1368 // Load vector reg, register offset, extend, scale, S/D-form
1369 // Load vector reg, register offset, extend, scale, H/Q-form
1370 def : InstRW<[V2Write_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>;
1372 // Load vector pair, immed offset, S/D-form
1373 def : InstRW<[V2Write_6cyc_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>;
1375 // Load vector pair, immed offset, Q-form
1376 def : InstRW<[V2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
1378 // Load vector pair, immed post-index, S/D-form
1379 // Load vector pair, immed pre-index, S/D-form
1380 def : InstRW<[WriteAdr, V2Write_6cyc_1I_1L, WriteLDHi],
1381              (instregex "^LDP[SD](pre|post)$")>;
1383 // Load vector pair, immed post-index, Q-form
1384 // Load vector pair, immed pre-index, Q-form
1385 def : InstRW<[WriteAdr, V2Write_6cyc_2I_2L, WriteLDHi], (instrs LDPQpost,
1386                                                                 LDPQpre)>;
1388 // FP store instructions
1389 // -----------------------------------------------------------------------------
1391 // Store vector reg, unscaled immed, B/H/S/D-form
1392 // Store vector reg, unscaled immed, Q-form
1393 def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^STUR[BHSDQ]i$")>;
1395 // Store vector reg, immed post-index, B/H/S/D-form
1396 // Store vector reg, immed post-index, Q-form
1397 // Store vector reg, immed pre-index, B/H/S/D-form
1398 // Store vector reg, immed pre-index, Q-form
1399 def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01_1I],
1400              (instregex "^STR[BHSDQ](pre|post)$")>;
1402 // Store vector reg, unsigned immed, B/H/S/D-form
1403 // Store vector reg, unsigned immed, Q-form
1404 def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^STR[BHSDQ]ui$")>;
1406 // Store vector reg, register offset, basic, B/H/S/D-form
1407 // Store vector reg, register offset, basic, Q-form
1408 // Store vector reg, register offset, scale, H-form
1409 // Store vector reg, register offset, scale, S/D-form
1410 // Store vector reg, register offset, scale, Q-form
1411 // Store vector reg, register offset, extend, B/H/S/D-form
1412 // Store vector reg, register offset, extend, Q-form
1413 // Store vector reg, register offset, extend, scale, H-form
1414 // Store vector reg, register offset, extend, scale, S/D-form
1415 // Store vector reg, register offset, extend, scale, Q-form
1416 def : InstRW<[V2Write_StrHQ, ReadAdrBase],
1417              (instregex "^STR[BHSDQ]ro[WX]$")>;
1419 // Store vector pair, immed offset, S-form
1420 // Store vector pair, immed offset, D-form
1421 def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^STN?P[SD]i$")>;
1423 // Store vector pair, immed offset, Q-form
1424 def : InstRW<[V2Write_2cyc_1L01_2V01], (instrs STPQi, STNPQi)>;
1426 // Store vector pair, immed post-index, S-form
1427 // Store vector pair, immed post-index, D-form
1428 // Store vector pair, immed pre-index, S-form
1429 // Store vector pair, immed pre-index, D-form
1430 def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01_1I],
1431              (instregex "^STP[SD](pre|post)$")>;
1433 // Store vector pair, immed post-index, Q-form
// NOTE(review): unlike the S/D-form "^STP[SD](pre|post)$" entry, this list has
// no leading WriteAdr -- TODO confirm this is intentional.
1434 def : InstRW<[V2Write_2cyc_1L01_2V01_1I], (instrs STPQpost)>;
1436 // Store vector pair, immed pre-index, Q-form
// NOTE(review): also no leading WriteAdr; the pre-index form names 2I where
// the post-index form names 1I -- verify both against the SOG.
1437 def : InstRW<[V2Write_2cyc_1L01_2V01_2I], (instrs STPQpre)>;
1439 // ASIMD integer instructions
1440 // -----------------------------------------------------------------------------
1442 // ASIMD absolute diff
1443 // ASIMD absolute diff long
1444 // ASIMD arith, basic
1445 // ASIMD arith, complex
1446 // ASIMD arith, pair-wise
1447 // ASIMD compare
1448 // ASIMD logical
1449 // ASIMD max/min, basic and pair-wise
1450 def : SchedAlias<WriteVd, V2Write_2cyc_1V>;
1451 def : SchedAlias<WriteVq, V2Write_2cyc_1V>;
1453 // ASIMD absolute diff accum
1454 // ASIMD absolute diff accum long
1455 def : InstRW<[V2Wr_VA, V2Rd_VA], (instregex "^[SU]ABAL?v")>;
1457 // ASIMD arith, reduce, 4H/4S
1458 def : InstRW<[V2Write_2cyc_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
1460 // ASIMD arith, reduce, 8B/8H
1461 def : InstRW<[V2Write_4cyc_1V13_1V],
1462              (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
1464 // ASIMD arith, reduce, 16B
1465 def : InstRW<[V2Write_4cyc_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
1467 // ASIMD dot product
1468 // ASIMD dot product using signed and unsigned integers
1469 def : InstRW<[V2Wr_VDOT, V2Rd_VDOT],
1470              (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
1472 // ASIMD matrix multiply-accumulate
1473 def : InstRW<[V2Wr_VMMA, V2Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
1475 // ASIMD max/min, reduce, 4H/4S
1476 def : InstRW<[V2Write_2cyc_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$",
1477                                              "^[SU](MAX|MIN)Vv4i32v$")>;
1479 // ASIMD max/min, reduce, 8B/8H
1480 def : InstRW<[V2Write_4cyc_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
1481                                                 "^[SU](MAX|MIN)Vv8i16v$")>;
1483 // ASIMD max/min, reduce, 16B
// The pattern is anchored with '^' like the 4H/4S and 8B/8H max/min reduce
// entries. TableGen's instregex already matches from the start of the
// instruction name, so the matched set should be unchanged.
1484 def : InstRW<[V2Write_4cyc_2V13], (instregex "^[SU](MAX|MIN)Vv16i8v$")>;
1486 // ASIMD multiply
1487 def : InstRW<[V2Write_4cyc_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
1489 // ASIMD multiply accumulate
1490 def : InstRW<[V2Wr_VMA, V2Rd_VMA], (instregex "^MLAv", "^MLSv")>;
1492 // ASIMD multiply accumulate high
1493 def : InstRW<[V2Wr_VMAH, V2Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
1495 // ASIMD multiply accumulate long
1496 def : InstRW<[V2Wr_VMAL, V2Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
1498 // ASIMD multiply accumulate saturating long
1499 def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQDML[AS]L[iv]")>;
1501 // ASIMD multiply/multiply long (8x8) polynomial, D-form
1502 // ASIMD multiply/multiply long (8x8) polynomial, Q-form
1503 def : InstRW<[V2Write_3cyc_1V23], (instregex "^PMULL?(v8i8|v16i8)$")>;
1505 // ASIMD multiply long
1506 def : InstRW<[V2Write_3cyc_1V02], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>;
1508 // ASIMD pairwise add and accumulate long
1509 def : InstRW<[V2Wr_VPA, V2Rd_VPA], (instregex "^[SU]ADALPv")>;
1511 // ASIMD shift accumulate
1512 def : InstRW<[V2Wr_VSA, V2Rd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>;
1514 // ASIMD shift by immed, basic
1515 def : InstRW<[V2Write_2cyc_1V13], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv",
1516                                              "^SSHLLv", "^SSHR[dv]", "^USHLLv",
1517                                              "^USHR[dv]")>;
1519 // ASIMD shift by immed and insert, basic
1520 def : InstRW<[V2Write_2cyc_1V13], (instregex "^SLI[dv]", "^SRI[dv]")>;
1522 // ASIMD shift by immed, complex
1523 def : InstRW<[V2Write_4cyc_1V13],
1524              (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$",
1525                         "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
1526                         "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]",
1527                         "^UQSHRN[bhsv]", "^URSHR[dv]")>;
1529 // ASIMD shift by register, basic
1530 def : InstRW<[V2Write_2cyc_1V13], (instregex "^[SU]SHLv")>;
1532 // ASIMD shift by register, complex
1533 def : InstRW<[V2Write_4cyc_1V13],
1534              (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
1535                         "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
// ASIMD floating-point entries: conversions, divide, reductions, multiply,
// rounding and square root. `instregex` selects instructions by name pattern
// and `instrs` by exact record name; the bracketed list gives the scheduling
// write (and, where present, read) resources applied to them. Those resources
// are defined outside this excerpt.
1537 // ASIMD floating-point instructions
1538 // -----------------------------------------------------------------------------
1540 // ASIMD FP absolute value/difference
1541 // ASIMD FP arith, normal
1542 // ASIMD FP compare
1543 // ASIMD FP complex add
1544 // ASIMD FP max/min, normal
1545 // ASIMD FP max/min, pairwise
1546 // ASIMD FP negate
1547 // Handled by SchedAlias<WriteV[dq], ...>
// (No explicit InstRW entry is given here for the categories listed above.)
1549 // ASIMD FP complex multiply add
1550 def : InstRW<[V2Wr_VFCMA, V2Rd_VFCMA], (instregex "^FCMLAv")>;
1552 // ASIMD FP convert, long (F16 to F32)
1553 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVTL(v4|v8)i16")>;
1555 // ASIMD FP convert, long (F32 to F64)
1556 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVTL(v2|v4)i32")>;
1558 // ASIMD FP convert, narrow (F32 to F16)
1559 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVTN(v4|v8)i16")>;
1561 // ASIMD FP convert, narrow (F64 to F32)
1562 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVTN(v2|v4)i32",
1563                                              "^FCVTXN(v2|v4)f32")>;
// The three "convert, other" entries below are split by the arrangement in the
// record name: v2f32/v2f64, v4f16/v4f32 and v8f16. Each entry also lists the
// scalar-form patterns that share its resource.
1565 // ASIMD FP convert, other, D-form F32 and Q-form F64
1566 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$",
1567                                              "^FCVT[AMNPZ][SU]v1i64$",
1568                                              "^FCVTZ[SU]d$",
1569                                              "^[SU]CVTFv2f(32|64)$",
1570                                              "^[SU]CVTFv1i64$",
1571                                              "^[SU]CVTFd$")>;
1573 // ASIMD FP convert, other, D-form F16 and Q-form F32
1574 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$",
1575                                              "^FCVT[AMNPZ][SU]v1i32$",
1576                                              "^FCVTZ[SU]s$",
1577                                              "^[SU]CVTFv4f(16|32)$",
1578                                              "^[SU]CVTFv1i32$",
1579                                              "^[SU]CVTFs$")>;
1581 // ASIMD FP convert, other, Q-form F16
1582 def : InstRW<[V2Write_6cyc_4V02], (instregex "^FCVT[AMNPZ][SU]v8f16$",
1583                                              "^FCVT[AMNPZ][SU]v1f16$",
1584                                              "^FCVTZ[SU]h$",
1585                                              "^[SU]CVTFv8f16$",
1586                                              "^[SU]CVTFv1i16$",
1587                                              "^[SU]CVTFh$")>;
// Divide entries: one exact instruction per entry.
// NOTE(review): the trailing "_Nrc" in these resource names presumably gives
// the number of cycles the unit stays occupied -- confirm at the definitions.
1589 // ASIMD FP divide, D-form, F16
1590 def : InstRW<[V2Write_7cyc_1V02_7rc], (instrs FDIVv4f16)>;
1592 // ASIMD FP divide, D-form, F32
1593 def : InstRW<[V2Write_10cyc_1V02_5rc], (instrs FDIVv2f32)>;
1595 // ASIMD FP divide, Q-form, F16
1596 def : InstRW<[V2Write_13cyc_1V02_13rc], (instrs FDIVv8f16)>;
1598 // ASIMD FP divide, Q-form, F32
1599 def : InstRW<[V2Write_10cyc_1V02_10rc], (instrs FDIVv4f32)>;
1601 // ASIMD FP divide, Q-form, F64
1602 def : InstRW<[V2Write_15cyc_1V02_14rc], (instrs FDIVv2f64)>;
1604 // ASIMD FP max/min, reduce, F32 and D-form F16
1605 def : InstRW<[V2Write_4cyc_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
1607 // ASIMD FP max/min, reduce, Q-form F16
1608 def : InstRW<[V2Write_6cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
1610 // ASIMD FP multiply
1611 def : InstRW<[V2Wr_VFM], (instregex "^FMULv", "^FMULXv")>;
1613 // ASIMD FP multiply accumulate
1614 def : InstRW<[V2Wr_VFMA, V2Rd_VFMA], (instregex "^FMLAv", "^FMLSv")>;
1616 // ASIMD FP multiply accumulate long
1617 def : InstRW<[V2Wr_VFMAL, V2Rd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>;
// The round entries use the same arrangement split as the conversions above
// (v2f32/v2f64, v4f16/v4f32, v8f16).
1619 // ASIMD FP round, D-form F32 and Q-form F64
1620 def : InstRW<[V2Write_3cyc_1V02],
1621              (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
1622                         "^FRINT(32|64)[XZ]v2f(32|64)$")>;
1624 // ASIMD FP round, D-form F16 and Q-form F32
1625 def : InstRW<[V2Write_4cyc_2V02],
1626              (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
1627                         "^FRINT(32|64)[XZ]v4f32$")>;
1629 // ASIMD FP round, Q-form F16
1630 def : InstRW<[V2Write_6cyc_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
// Square-root entries: one exact instruction per entry, like the divides.
1632 // ASIMD FP square root, D-form, F16
1633 def : InstRW<[V2Write_7cyc_1V02_7rc], (instrs FSQRTv4f16)>;
1635 // ASIMD FP square root, D-form, F32
1636 def : InstRW<[V2Write_10cyc_1V02_5rc], (instrs FSQRTv2f32)>;
1638 // ASIMD FP square root, Q-form, F16
1639 def : InstRW<[V2Write_13cyc_1V02_13rc], (instrs FSQRTv8f16)>;
1641 // ASIMD FP square root, Q-form, F32
1642 def : InstRW<[V2Write_10cyc_1V02_9rc], (instrs FSQRTv4f32)>;
1644 // ASIMD FP square root, Q-form, F64
1645 def : InstRW<[V2Write_16cyc_1V02_15rc], (instrs FSQRTv2f64)>;
// BFloat16 entries. Every entry uses `instrs`, i.e. it names the exact
// instruction records rather than matching by pattern. Entries with two
// resources pair a write (V2Wr_*) with a read (V2Rd_*) record; both are
// defined outside this excerpt.
1647 // ASIMD BFloat16 (BF16) instructions
1648 // -----------------------------------------------------------------------------
1650 // ASIMD convert, F32 to BF16
1651 def : InstRW<[V2Write_4cyc_2V02], (instrs BFCVTN, BFCVTN2)>;
1653 // ASIMD dot product
1654 def : InstRW<[V2Wr_VBFDOT, V2Rd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
1656 // ASIMD matrix multiply accumulate
1657 def : InstRW<[V2Wr_VBFMMA, V2Rd_VBFMMA], (instrs BFMMLA)>;
1659 // ASIMD multiply accumulate long
1660 def : InstRW<[V2Wr_VBFMAL, V2Rd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT,
1661                                                  BFMLALTIdx)>;
1663 // Scalar convert, F32 to BF16
1664 def : InstRW<[V2Write_3cyc_1V02], (instrs BFCVT)>;
// ASIMD miscellaneous entries: moves, duplicates, estimates, table lookups
// and vector<->general-register transfers. Each `def : InstRW` binds the
// listed instructions (`instrs` = exact names, `instregex` = name patterns)
// to the bracketed scheduling resources, which are defined outside this
// excerpt.
1666 // ASIMD miscellaneous instructions
1667 // -----------------------------------------------------------------------------
1669 // ASIMD bit reverse
1670 // ASIMD bitwise insert
1671 // ASIMD count
1672 // ASIMD duplicate, element
1673 // ASIMD extract
1674 // ASIMD extract narrow
1675 // ASIMD insert, element to element
1676 // ASIMD move, FP immed
1677 // ASIMD move, integer immed
1678 // ASIMD reverse
1679 // ASIMD table lookup extension, 1 table reg
1680 // ASIMD transpose
1681 // ASIMD unzip/zip
1682 // Handled by SchedAlias<WriteV[dq], ...>
// Explicit entry for MOVID and MOVIv2d_ns only.
// NOTE(review): "0or2cyc" suggests a latency of either 0 or 2 cycles --
// confirm at the definition of V2Write_0or2cyc_1V.
1683 def : InstRW<[V2Write_0or2cyc_1V], (instrs MOVID, MOVIv2d_ns)>;
1685 // ASIMD duplicate, gen reg
1686 def : InstRW<[V2Write_3cyc_1M0], (instregex "^DUPv.+gpr")>;
1688 // ASIMD extract narrow, saturating
1689 def : InstRW<[V2Write_4cyc_1V13], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
1691 // ASIMD reciprocal and square root estimate, D-form U32
1692 def : InstRW<[V2Write_3cyc_1V02], (instrs URECPEv2i32, URSQRTEv2i32)>;
1694 // ASIMD reciprocal and square root estimate, Q-form U32
1695 def : InstRW<[V2Write_4cyc_2V02], (instrs URECPEv4i32, URSQRTEv4i32)>;
1697 // ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
1698 def : InstRW<[V2Write_3cyc_1V02], (instrs FRECPEv1f16, FRECPEv1i32,
1699                                           FRECPEv1i64, FRECPEv2f32,
1700                                           FRSQRTEv1f16, FRSQRTEv1i32,
1701                                           FRSQRTEv1i64, FRSQRTEv2f32)>;
1703 // ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
1704 def : InstRW<[V2Write_4cyc_2V02], (instrs FRECPEv4f16, FRECPEv4f32,
1705                                           FRSQRTEv4f16, FRSQRTEv4f32)>;
1707 // ASIMD reciprocal and square root estimate, Q-form F16
1708 def : InstRW<[V2Write_6cyc_4V02], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
1710 // ASIMD reciprocal exponent
1711 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FRECPXv")>;
1713 // ASIMD reciprocal step
1714 def : InstRW<[V2Write_4cyc_1V], (instregex "^FRECPS(32|64|v)",
1715                                            "^FRSQRTS(32|64|v)")>;
// TBL entries are keyed on the number of table registers in the record name
// (One/Two/Three/Four); the 8b and 16b arrangements share each entry.
1717 // ASIMD table lookup, 1 or 2 table regs
1718 def : InstRW<[V2Write_2cyc_1V01], (instrs TBLv8i8One, TBLv16i8One,
1719                                           TBLv8i8Two, TBLv16i8Two)>;
1721 // ASIMD table lookup, 3 table regs
1722 def : InstRW<[V2Write_4cyc_2V01], (instrs TBLv8i8Three, TBLv16i8Three)>;
1724 // ASIMD table lookup, 4 table regs
1725 def : InstRW<[V2Write_4cyc_3V01], (instrs TBLv8i8Four, TBLv16i8Four)>;
// TBX with a single table register has no entry here; it is listed in the
// "Handled by SchedAlias" comment group at the top of this section.
1727 // ASIMD table lookup extension, 2 table regs
1728 def : InstRW<[V2Write_4cyc_2V], (instrs TBXv8i8Two, TBXv16i8Two)>;
1730 // ASIMD table lookup extension, 3 table regs
1731 def : InstRW<[V2Write_6cyc_3V], (instrs TBXv8i8Three, TBXv16i8Three)>;
1733 // ASIMD table lookup extension, 4 table regs
1734 def : InstRW<[V2Write_6cyc_5V], (instrs TBXv8i8Four, TBXv16i8Four)>;
1736 // ASIMD transfer, element to gen reg
1737 def : InstRW<[V2Write_2cyc_2V01], (instregex "^[SU]MOVv")>;
1739 // ASIMD transfer, gen reg to element
1740 def : InstRW<[V2Write_5cyc_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
// ASIMD LD1 multiple-structure loads. Each category has two entries: the
// base record name and its "_POST" variant. The "_POST" entry uses the same
// load resource but lists WriteAdr first.
// NOTE(review): WriteAdr presumably models the base-register write-back of
// the post-indexed form -- confirm at its definition.
// D-form and Q-form arrangements get separate entries even where both use the
// same resource.
1742 // ASIMD load instructions
1743 // -----------------------------------------------------------------------------
1745 // ASIMD load, 1 element, multiple, 1 reg, D-form
1746 def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
1747 def : InstRW<[WriteAdr, V2Write_6cyc_1L],
1748              (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
1750 // ASIMD load, 1 element, multiple, 1 reg, Q-form
1751 def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
1752 def : InstRW<[WriteAdr, V2Write_6cyc_1L],
1753              (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
1755 // ASIMD load, 1 element, multiple, 2 reg, D-form
1756 def : InstRW<[V2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
1757 def : InstRW<[WriteAdr, V2Write_6cyc_2L],
1758              (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
1760 // ASIMD load, 1 element, multiple, 2 reg, Q-form
1761 def : InstRW<[V2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
1762 def : InstRW<[WriteAdr, V2Write_6cyc_2L],
1763              (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
1765 // ASIMD load, 1 element, multiple, 3 reg, D-form
1766 def : InstRW<[V2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
1767 def : InstRW<[WriteAdr, V2Write_6cyc_3L],
1768              (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
1770 // ASIMD load, 1 element, multiple, 3 reg, Q-form
1771 def : InstRW<[V2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
1772 def : InstRW<[WriteAdr, V2Write_6cyc_3L],
1773              (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
1775 // ASIMD load, 1 element, multiple, 4 reg, D-form
1776 def : InstRW<[V2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
1777 def : InstRW<[WriteAdr, V2Write_7cyc_4L],
1778              (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
1780 // ASIMD load, 1 element, multiple, 4 reg, Q-form
1781 def : InstRW<[V2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
1782 def : InstRW<[WriteAdr, V2Write_7cyc_4L],
1783              (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
// ASIMD LD1 single-lane / replicate loads and all LD2 loads. Each category
// has a base entry and a "_POST" entry; the "_POST" entry adds WriteAdr in
// front of the same load resource. Where several category comments precede
// one pair of entries, those categories share that resource.
// NOTE(review): unlike most patterns in this file, these omit the leading
// "^"; presumably instregex anchors matches at the start of the name anyway --
// confirm before relying on it.
1785 // ASIMD load, 1 element, one lane, B/H/S
1786 // ASIMD load, 1 element, one lane, D
1787 def : InstRW<[V2Write_8cyc_1L_1V],           (instregex "LD1i(8|16|32|64)$")>;
1788 def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
1790 // ASIMD load, 1 element, all lanes, D-form, B/H/S
1791 // ASIMD load, 1 element, all lanes, D-form, D
1792 def : InstRW<[V2Write_8cyc_1L_1V],           (instregex "LD1Rv(8b|4h|2s|1d)$")>;
1793 def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
1795 // ASIMD load, 1 element, all lanes, Q-form
1796 def : InstRW<[V2Write_8cyc_1L_1V],           (instregex "LD1Rv(16b|8h|4s|2d)$")>;
1797 def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
// The D-form LD2 pattern lists only 8b/4h/2s (no 1d arrangement).
1799 // ASIMD load, 2 element, multiple, D-form, B/H/S
1800 def : InstRW<[V2Write_8cyc_1L_2V],           (instregex "LD2Twov(8b|4h|2s)$")>;
1801 def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
1803 // ASIMD load, 2 element, multiple, Q-form, B/H/S
1804 // ASIMD load, 2 element, multiple, Q-form, D
1805 def : InstRW<[V2Write_8cyc_2L_2V],           (instregex "LD2Twov(16b|8h|4s|2d)$")>;
1806 def : InstRW<[WriteAdr, V2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
1808 // ASIMD load, 2 element, one lane, B/H
1809 // ASIMD load, 2 element, one lane, S
1810 // ASIMD load, 2 element, one lane, D
1811 def : InstRW<[V2Write_8cyc_1L_2V],           (instregex "LD2i(8|16|32|64)$")>;
1812 def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>;
1814 // ASIMD load, 2 element, all lanes, D-form, B/H/S
1815 // ASIMD load, 2 element, all lanes, D-form, D
1816 def : InstRW<[V2Write_8cyc_1L_2V],            (instregex "LD2Rv(8b|4h|2s|1d)$")>;
1817 def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V],  (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
1819 // ASIMD load, 2 element, all lanes, Q-form
1820 def : InstRW<[V2Write_8cyc_1L_2V],           (instregex "LD2Rv(16b|8h|4s|2d)$")>;
1821 def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD LD3 and LD4 loads (multiple-structure, single-lane and replicate).
// Each category has a base entry and a "_POST" entry; the "_POST" entry adds
// WriteAdr in front of the same load resource. Where several category
// comments precede one pair of entries, those categories share that resource.
// NOTE(review): these patterns omit the leading "^" used by most patterns in
// this file; presumably instregex anchors at the start of the name anyway --
// confirm before relying on it.
1823 // ASIMD load, 3 element, multiple, D-form, B/H/S
1824 def : InstRW<[V2Write_8cyc_2L_3V],           (instregex "LD3Threev(8b|4h|2s)$")>;
1825 def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
1827 // ASIMD load, 3 element, multiple, Q-form, B/H/S
1828 // ASIMD load, 3 element, multiple, Q-form, D
1829 def : InstRW<[V2Write_8cyc_3L_3V],           (instregex "LD3Threev(16b|8h|4s|2d)$")>;
1830 def : InstRW<[WriteAdr, V2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
1832 // ASIMD load, 3 element, one lane, B/H
1833 // ASIMD load, 3 element, one lane, S
1834 // ASIMD load, 3 element, one lane, D
1835 def : InstRW<[V2Write_8cyc_2L_3V],           (instregex "LD3i(8|16|32|64)$")>;
1836 def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>;
1838 // ASIMD load, 3 element, all lanes, D-form, B/H/S
1839 // ASIMD load, 3 element, all lanes, D-form, D
1840 def : InstRW<[V2Write_8cyc_2L_3V],           (instregex "LD3Rv(8b|4h|2s|1d)$")>;
1841 def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
1843 // ASIMD load, 3 element, all lanes, Q-form, B/H/S
1844 // ASIMD load, 3 element, all lanes, Q-form, D
1845 def : InstRW<[V2Write_8cyc_3L_3V],           (instregex "LD3Rv(16b|8h|4s|2d)$")>;
1846 def : InstRW<[WriteAdr, V2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
1848 // ASIMD load, 4 element, multiple, D-form, B/H/S
1849 def : InstRW<[V2Write_8cyc_3L_4V],           (instregex "LD4Fourv(8b|4h|2s)$")>;
1850 def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
// The Q-form LD4 multiple-structure load is the only entry in this group that
// uses a 9cyc resource.
1852 // ASIMD load, 4 element, multiple, Q-form, B/H/S
1853 // ASIMD load, 4 element, multiple, Q-form, D
1854 def : InstRW<[V2Write_9cyc_6L_4V],           (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
1855 def : InstRW<[WriteAdr, V2Write_9cyc_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
1857 // ASIMD load, 4 element, one lane, B/H
1858 // ASIMD load, 4 element, one lane, S
1859 // ASIMD load, 4 element, one lane, D
1860 def : InstRW<[V2Write_8cyc_3L_4V],           (instregex "LD4i(8|16|32|64)$")>;
1861 def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>;
1863 // ASIMD load, 4 element, all lanes, D-form, B/H/S
1864 // ASIMD load, 4 element, all lanes, D-form, D
1865 def : InstRW<[V2Write_8cyc_3L_4V],           (instregex "LD4Rv(8b|4h|2s|1d)$")>;
1866 def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
1868 // ASIMD load, 4 element, all lanes, Q-form, B/H/S
1869 // ASIMD load, 4 element, all lanes, Q-form, D
1870 def : InstRW<[V2Write_8cyc_4L_4V],           (instregex "LD4Rv(16b|8h|4s|2d)$")>;
1871 def : InstRW<[WriteAdr, V2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD ST1 stores (multiple-structure and single-lane). Each category has a
// base entry and a "_POST" entry; the "_POST" entry adds WriteAdr in front of
// the same store resource. The store resources are defined outside this
// excerpt.
// NOTE(review): these patterns omit the leading "^" used by most patterns in
// this file; presumably instregex anchors at the start of the name anyway --
// confirm before relying on it.
1873 // ASIMD store instructions
1874 // -----------------------------------------------------------------------------
1876 // ASIMD store, 1 element, multiple, 1 reg, D-form
1877 def : InstRW<[V2Write_2cyc_1L01_1V01],           (instregex "ST1Onev(8b|4h|2s|1d)$")>;
1878 def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
1880 // ASIMD store, 1 element, multiple, 1 reg, Q-form
1881 def : InstRW<[V2Write_2cyc_1L01_1V01],           (instregex "ST1Onev(16b|8h|4s|2d)$")>;
1882 def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
1884 // ASIMD store, 1 element, multiple, 2 reg, D-form
1885 def : InstRW<[V2Write_2cyc_1L01_1V01],           (instregex "ST1Twov(8b|4h|2s|1d)$")>;
1886 def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
1888 // ASIMD store, 1 element, multiple, 2 reg, Q-form
1889 def : InstRW<[V2Write_2cyc_2L01_2V01],           (instregex "ST1Twov(16b|8h|4s|2d)$")>;
1890 def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
1892 // ASIMD store, 1 element, multiple, 3 reg, D-form
1893 def : InstRW<[V2Write_2cyc_2L01_2V01],           (instregex "ST1Threev(8b|4h|2s|1d)$")>;
1894 def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
1896 // ASIMD store, 1 element, multiple, 3 reg, Q-form
1897 def : InstRW<[V2Write_2cyc_3L01_3V01],           (instregex "ST1Threev(16b|8h|4s|2d)$")>;
1898 def : InstRW<[WriteAdr, V2Write_2cyc_3L01_3V01], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
1900 // ASIMD store, 1 element, multiple, 4 reg, D-form
1901 def : InstRW<[V2Write_2cyc_2L01_2V01],           (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
1902 def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
1904 // ASIMD store, 1 element, multiple, 4 reg, Q-form
1905 def : InstRW<[V2Write_2cyc_4L01_4V01],           (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
1906 def : InstRW<[WriteAdr, V2Write_2cyc_4L01_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
1908 // ASIMD store, 1 element, one lane, B/H/S
1909 // ASIMD store, 1 element, one lane, D
1910 def : InstRW<[V2Write_4cyc_1L01_2V01],           (instregex "ST1i(8|16|32|64)$")>;
1911 def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST1i(8|16|32|64)_POST$")>;
// ASIMD ST2, ST3 and ST4 stores (multiple-structure and single-lane). Each
// category has a base entry and a "_POST" entry; the "_POST" entry adds
// WriteAdr in front of the same store resource. Where several category
// comments precede one pair of entries, those categories share that resource.
// NOTE(review): these patterns omit the leading "^" used by most patterns in
// this file; presumably instregex anchors at the start of the name anyway --
// confirm before relying on it.
1913 // ASIMD store, 2 element, multiple, D-form, B/H/S
1914 def : InstRW<[V2Write_4cyc_1L01_2V01],           (instregex "ST2Twov(8b|4h|2s)$")>;
1915 def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
1917 // ASIMD store, 2 element, multiple, Q-form, B/H/S
1918 // ASIMD store, 2 element, multiple, Q-form, D
1919 def : InstRW<[V2Write_4cyc_2L01_4V01],           (instregex "ST2Twov(16b|8h|4s|2d)$")>;
1920 def : InstRW<[WriteAdr, V2Write_4cyc_2L01_4V01], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
1922 // ASIMD store, 2 element, one lane, B/H/S
1923 // ASIMD store, 2 element, one lane, D
1924 def : InstRW<[V2Write_4cyc_1L01_2V01],           (instregex "ST2i(8|16|32|64)$")>;
1925 def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST2i(8|16|32|64)_POST$")>;
1927 // ASIMD store, 3 element, multiple, D-form, B/H/S
1928 def : InstRW<[V2Write_5cyc_2L01_4V01],           (instregex "ST3Threev(8b|4h|2s)$")>;
1929 def : InstRW<[WriteAdr, V2Write_5cyc_2L01_4V01], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
1931 // ASIMD store, 3 element, multiple, Q-form, B/H/S
1932 // ASIMD store, 3 element, multiple, Q-form, D
1933 def : InstRW<[V2Write_6cyc_3L01_6V01],           (instregex "ST3Threev(16b|8h|4s|2d)$")>;
1934 def : InstRW<[WriteAdr, V2Write_6cyc_3L01_6V01], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
1936 // ASIMD store, 3 element, one lane, B/H
1937 // ASIMD store, 3 element, one lane, S
1938 // ASIMD store, 3 element, one lane, D
1939 def : InstRW<[V2Write_5cyc_2L01_4V01],           (instregex "ST3i(8|16|32|64)$")>;
1940 def : InstRW<[WriteAdr, V2Write_5cyc_2L01_4V01], (instregex "ST3i(8|16|32|64)_POST$")>;
1942 // ASIMD store, 4 element, multiple, D-form, B/H/S
1943 def : InstRW<[V2Write_6cyc_2L01_6V01],           (instregex "ST4Fourv(8b|4h|2s)$")>;
1944 def : InstRW<[WriteAdr, V2Write_6cyc_2L01_6V01], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
// For ST4 the Q-form D-element (2d) arrangement is split from B/H/S and given
// a different resource, unlike ST2/ST3 where the Q-form categories share one.
1946 // ASIMD store, 4 element, multiple, Q-form, B/H/S
1947 def : InstRW<[V2Write_7cyc_4L01_12V01],           (instregex "ST4Fourv(16b|8h|4s)$")>;
1948 def : InstRW<[WriteAdr, V2Write_7cyc_4L01_12V01], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
1950 // ASIMD store, 4 element, multiple, Q-form, D
1951 def : InstRW<[V2Write_5cyc_4L01_8V01],           (instregex "ST4Fourv(2d)$")>;
1952 def : InstRW<[WriteAdr, V2Write_5cyc_4L01_8V01], (instregex "ST4Fourv(2d)_POST$")>;
// The ST4 single-lane forms are likewise split: 8/16/32-bit lanes versus
// 64-bit lanes.
1954 // ASIMD store, 4 element, one lane, B/H/S
1955 def : InstRW<[V2Write_6cyc_1L01_3V01],           (instregex "ST4i(8|16|32)$")>;
1956 def : InstRW<[WriteAdr, V2Write_6cyc_1L01_3V01], (instregex "ST4i(8|16|32)_POST$")>;
1958 // ASIMD store, 4 element, one lane, D
1959 def : InstRW<[V2Write_4cyc_2L01_4V01],            (instregex "ST4i(64)$")>;
1960 def : InstRW<[WriteAdr, V2Write_4cyc_2L01_4V01],  (instregex "ST4i(64)_POST$")>;
// Cryptography-extension entries (AES, 64x64 PMULL, SHA1/SHA256/SHA512, SHA3,
// SM3, SM4). Each `def : InstRW` binds the listed instructions (`instrs` =
// exact names, `instregex` = name patterns) to one scheduling write resource
// defined outside this excerpt.
1962 // Cryptography extensions
1963 // -----------------------------------------------------------------------------
1965 // Crypto AES ops
1966 def : InstRW<[V2Write_2cyc_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
1968 // Crypto polynomial (64x64) multiply long
1969 def : InstRW<[V2Write_2cyc_1V], (instrs PMULLv1i64, PMULLv2i64)>;
1971 // Crypto SHA1 hash acceleration op
1972 // Crypto SHA1 schedule acceleration ops
1973 def : InstRW<[V2Write_2cyc_1V0], (instregex "^SHA1(H|SU0|SU1)")>;
1975 // Crypto SHA1 hash acceleration ops
1976 // Crypto SHA256 hash acceleration ops
1977 def : InstRW<[V2Write_4cyc_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
1979 // Crypto SHA256 schedule acceleration ops
1980 def : InstRW<[V2Write_2cyc_1V0], (instregex "^SHA256SU[01]")>;
1982 // Crypto SHA512 hash acceleration ops
1983 def : InstRW<[V2Write_2cyc_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>;
1985 // Crypto SHA3 ops
1986 def : InstRW<[V2Write_2cyc_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
1988 // Crypto SM3 ops
1989 def : InstRW<[V2Write_2cyc_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
1990                                             "^SM3TT[12][AB]$")>;
1992 // Crypto SM4 ops
1993 def : InstRW<[V2Write_4cyc_1V0], (instrs SM4E, SM4ENCKEY)>;
1995 // CRC
1996 // -----------------------------------------------------------------------------
// A single entry covers every instruction whose name starts with "CRC32". It
// pairs a write resource (V2Wr_CRC) with a read resource (V2Rd_CRC); both are
// defined outside this excerpt.
1998 def : InstRW<[V2Wr_CRC, V2Rd_CRC], (instregex "^CRC32")>;
// SVE predicate entries: loop control, predicate counting, predicate logic
// and predicate permutes. Each `def : InstRW` binds the listed instructions
// (`instrs` = exact names, `instregex` = name patterns) to a scheduling write
// resource defined outside this excerpt.
// NOTE(review): resources named "XorYcyc" (e.g. V2Write_2or3cyc_1M)
// presumably choose between two latencies -- confirm at their definitions.
2000 // SVE Predicate instructions
2001 // -----------------------------------------------------------------------------
2003 // Loop control, based on predicate
2004 def : InstRW<[V2Write_2or3cyc_1M], (instrs BRKA_PPmP, BRKA_PPzP,
2005                                            BRKB_PPmP, BRKB_PPzP)>;
2007 // Loop control, based on predicate and flag setting
2008 def : InstRW<[V2Write_3or4cyc_2M], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
2010 // Loop control, propagating
2011 def : InstRW<[V2Write_2or3cyc_1M0], (instrs BRKN_PPzP, BRKPA_PPzPP,
2012                                             BRKPB_PPzPP)>;
2014 // Loop control, propagating and flag setting
2015 def : InstRW<[V2Write_3or4cyc_1M0_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP,
2016                                                BRKPBS_PPzPP)>;
2018 // Loop control, based on GPR
// Two entries with the same resource: the WHILE<cond> compares and the
// WHILERW/WHILEWR forms.
2019 def : InstRW<[V2Write_3cyc_2M],
2020              (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
2021 def : InstRW<[V2Write_3cyc_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
2023 // Loop terminate
2024 def : InstRW<[V2Write_1cyc_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
2026 // Predicate counting scalar
2027 def : InstRW<[V2Write_2cyc_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
2028 def : InstRW<[V2Write_2cyc_1M],
2029              (instregex "^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI",
2030                         "^SQ(DEC|INC)[BHWD]_XPiWdI",
2031                         "^UQ(DEC|INC)[BHWD]_WPiI")>;
2033 // Predicate counting scalar, ALL, {1,2,4}
// The plain DEC/INC forms get their own resource, V2Write_IncDec, instead of
// the fixed 2cyc resource used above.
// NOTE(review): the "ALL, {1,2,4}" condition is presumably evaluated by
// V2Write_IncDec itself -- confirm at its definition.
2034 def : InstRW<[V2Write_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>;
2036 // Predicate counting scalar, active predicate
2037 def : InstRW<[V2Write_2cyc_1M],
2038              (instregex "^CNTP_XPP_[BHSD]",
2039                         "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
2040                         "^(UQDEC|UQINC)P_WP_[BHSD]",
2041                         "^(SQDEC|SQINC)P_XPWd_[BHSD]")>;
2043 // Predicate counting vector, active predicate
2044 def : InstRW<[V2Write_7cyc_1M_1M0_1V],
2045              (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
2047 // Predicate logical
2048 def : InstRW<[V2Write_1or2cyc_1M0],
2049              (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
2051 // Predicate logical, flag setting
2052 def : InstRW<[V2Write_1or2cyc_1M0_1M],
2053              (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
2055 // Predicate reverse
2056 def : InstRW<[V2Write_2cyc_1M], (instregex "^REV_PP_[BHSD]")>;
2058 // Predicate select
2059 def : InstRW<[V2Write_1cyc_1M0], (instrs SEL_PPPP)>;
2061 // Predicate set
2062 def : InstRW<[V2Write_2cyc_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
2064 // Predicate set/initialize, set flags
2065 def : InstRW<[V2Write_3cyc_2M], (instregex "^PTRUES_[BHSD]")>;
2067 // Predicate find first/next
2068 def : InstRW<[V2Write_2cyc_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
2070 // Predicate test
2071 def : InstRW<[V2Write_1cyc_1M], (instrs PTEST_PP)>;
2073 // Predicate transpose
2074 def : InstRW<[V2Write_2cyc_1M], (instregex "^TRN[12]_PPP_[BHSD]")>;
2076 // Predicate unpack and widen
2077 def : InstRW<[V2Write_2cyc_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
2079 // Predicate zip/unzip
2080 def : InstRW<[V2Write_2cyc_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>;
// SVE integer arithmetic and shift entries. Each `def : InstRW` binds every
// instruction whose record name matches one of the `instregex` patterns to
// the bracketed scheduling resources (defined outside this excerpt). The
// patterns select on the operand-form tag in the name (e.g. _ZPmZ, _ZPZZ,
// _ZZZ, _ZI, _ZZI) followed by a trailing B/H/S/D letter set.
// NOTE(review): the trailing letter is presumably the element size -- confirm
// against the instruction definitions.
2082 // SVE integer instructions
2083 // -----------------------------------------------------------------------------
2085 // Arithmetic, absolute diff
2086 def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
2087                                            "^[SU]ABD_ZPZZ_[BHSD]")>;
2089 // Arithmetic, absolute diff accum
2090 def : InstRW<[V2Wr_ZA, V2Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
2092 // Arithmetic, absolute diff accum long
2093 def : InstRW<[V2Wr_ZA, V2Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
2095 // Arithmetic, absolute diff long
2096 def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
2098 // Arithmetic, basic
2099 def : InstRW<[V2Write_2cyc_1V],
2100              (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
2101                         "^(ADD|SUB)_ZZZ_[BHSD]",
2102                         "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
2103                         "^(ADD|SUB|SUBR)_ZI_[BHSD]",
2104                         "^ADR_[SU]XTW_ZZZ_D_[0123]",
2105                         "^ADR_LSL_ZZZ_[SD]_[0123]",
2106                         "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
2107                         "^SADDLBT_ZZZ_[HSD]",
2108                         "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
2109                         "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
2111 // Arithmetic, complex
2112 def : InstRW<[V2Write_2cyc_1V],
2113              (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]",
2114                         "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
2115                         "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
2116                         "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
2117                         "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
2118                         "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
2120 // Arithmetic, large integer
2121 def : InstRW<[V2Write_2cyc_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
2123 // Arithmetic, pairwise add
2124 def : InstRW<[V2Write_2cyc_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>;
2126 // Arithmetic, pairwise add and accum long
// NOTE(review): ReadDefault sits between the write and V2Rd_ZPA, so V2Rd_ZPA
// presumably applies to the second read operand rather than the first --
// confirm against the instruction's operand order.
2127 def : InstRW<[V2Wr_ZPA, ReadDefault, V2Rd_ZPA],
2128              (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
2130 // Arithmetic, shift
2131 def : InstRW<[V2Write_2cyc_1V13],
2132              (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
2133                         "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
2134                         "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
2135                         "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
2136                         "^(ASR|LSL|LSR)_ZZI_[BHSD]",
2137                         "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]",
2138                         "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
2140 // Arithmetic, shift and accumulate
2141 def : InstRW<[V2Wr_ZSA, V2Rd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>;
2143 // Arithmetic, shift by immediate
2144 def : InstRW<[V2Write_2cyc_1V13], (instregex "^SHRN[BT]_ZZI_[BHS]",
2145                                              "^[SU]SHLL[BT]_ZZI_[HSD]")>;
2147 // Arithmetic, shift by immediate and insert
2148 def : InstRW<[V2Write_2cyc_1V13], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>;
2150 // Arithmetic, shift complex
2151 def : InstRW<[V2Write_4cyc_1V13],
2152              (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
2153                         "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]",
2154                         "^[SU]QR?SHL_ZPZZ_[BHSD]",
2155                         "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
2156                         "^SQSHRU?N[BT]_ZZI_[BHS]",
2157                         "^UQR?SHRN[BT]_ZZI_[BHS]")>;
2159 // Arithmetic, shift right for divide
2160 def : InstRW<[V2Write_4cyc_1V13], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>;
2162 // Arithmetic, shift rounding
2163 def : InstRW<[V2Write_4cyc_1V13], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]",
2164                                              "^[SU]RSHL_ZPZZ_[BHSD]",
2165                                              "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>;
2167 // Bit manipulation
2168 def : InstRW<[V2Write_6cyc_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>;
2170 // Bitwise select
2171 def : InstRW<[V2Write_2cyc_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
2173 // Count/reverse bits
2174 def : InstRW<[V2Write_2cyc_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>;
2176 // Broadcast logical bitmask immediate to vector
2177 def : InstRW<[V2Write_2cyc_1V], (instrs DUPM_ZI)>;
2179 // Compare and set flags
2180 def : InstRW<[V2Write_4or5cyc_1V0_1M0],
2181              (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
2182                         "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
2184 // Complex add
// Each anonymous `def : InstRW<[...], (...)>` in this run maps a set of opcodes
// to a list of scheduling write/read types: `instrs` names opcodes exactly,
// `instregex` selects them by regular expression over the opcode name.
2185 def : InstRW<[V2Write_2cyc_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>;
2187 // Complex dot product 8-bit element (the _S-suffixed CDOT opcodes)
2188 def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
2190 // Complex dot product 16-bit element (the _D-suffixed CDOT opcodes)
2191 def : InstRW<[V2Wr_ZDOTH, V2Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
2193 // Complex multiply-add B, H, S element size (indexed ZZZI form: H and S only)
2194 def : InstRW<[V2Wr_ZCMABHS, V2Rd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]",
2195                                                       "^CMLA_ZZZI_[HS]")>;
2197 // Complex multiply-add D element size
2198 def : InstRW<[V2Wr_ZCMAD, V2Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;
2200 // Conditional extract operations, scalar form
2201 def : InstRW<[V2Write_8cyc_1M0_1V01], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
2203 // Conditional extract operations, SIMD&FP scalar and vector forms
// (this entry also covers COMPACT and SPLICE)
2204 def : InstRW<[V2Write_3cyc_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
2205                                             "^COMPACT_ZPZ_[SD]",
2206                                             "^SPLICE_ZPZZ?_[BHSD]")>;
2208 // Convert to floating point, 64b to float or convert to double
2209 def : InstRW<[V2Write_3cyc_1V02], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]",
2210                                              "^[SU]CVTF_ZPmZ_StoD")>;
2212 // Convert to floating point, 32b to single or half
2213 def : InstRW<[V2Write_4cyc_2V02], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
2215 // Convert to floating point, 16b to half
2216 def : InstRW<[V2Write_6cyc_4V02], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
2218 // Copy, scalar
2219 def : InstRW<[V2Write_5cyc_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>;
2221 // Copy, scalar SIMD&FP or imm
2222 def : InstRW<[V2Write_2cyc_1V], (instregex "^CPY_ZPm[IV]_[BHSD]",
2223                                            "^CPY_ZPzI_[BHSD]")>;
2225 // Divides, 32 bit
2226 def : InstRW<[V2Write_12cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_S",
2227                                              "^[SU]DIV_ZPZZ_S")>;
2229 // Divides, 64 bit
2230 def : InstRW<[V2Write_20cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
2231                                              "^[SU]DIV_ZPZZ_D")>;
2233 // Dot product, 8 bit (the _S-suffixed opcodes)
2234 def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S")>;
2236 // Dot product, 8 bit, using signed and unsigned integers
2237 def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
2239 // Dot product, 16 bit (the _D-suffixed opcodes)
2240 def : InstRW<[V2Wr_ZDOTH, V2Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D")>;
2242 // Duplicate, immediate and indexed form
2243 def : InstRW<[V2Write_2cyc_1V], (instregex "^DUP_ZI_[BHSD]",
2244                                            "^DUP_ZZI_[BHSDQ]")>;
2246 // Duplicate, scalar form
2247 def : InstRW<[V2Write_3cyc_1M0], (instregex "^DUP_ZR_[BHSD]")>;
2249 // Extend, sign or zero
// SXTB/UXTB apply to H, S and D elements, SXTH/UXTH to S and D, and SXTW/UXTW
// to D only, so the last pattern names the single suffix directly.
2250 def : InstRW<[V2Write_2cyc_1V13], (instregex "^[SU]XTB_ZPmZ_[HSD]",
2251                                              "^[SU]XTH_ZPmZ_[SD]",
2252                                              "^[SU]XTW_ZPmZ_D")>;
2254 // Extract
2255 def : InstRW<[V2Write_2cyc_1V], (instrs EXT_ZZI, EXT_ZZI_B)>;
2257 // Extract narrow saturating
2258 def : InstRW<[V2Write_4cyc_1V13], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
2259                                              "^SQXTUN[BT]_ZZ_[BHS]")>;
2261 // Extract/insert operation, SIMD and FP scalar form
2262 def : InstRW<[V2Write_3cyc_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]",
2263                                             "^INSR_ZV_[BHSD]")>;
2265 // Extract/insert operation, scalar
2266 def : InstRW<[V2Write_6cyc_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]",
2267                                                 "^INSR_ZR_[BHSD]")>;
2269 // Histogram operations
2270 def : InstRW<[V2Write_2cyc_1V], (instregex "^HISTCNT_ZPzZZ_[SD]",
2271                                            "^HISTSEG_ZZZ")>;
2273 // Horizontal operations, B, H, S form, immediate operands only
2274 def : InstRW<[V2Write_4cyc_1V02], (instregex "^INDEX_II_[BHS]")>;
2276 // Horizontal operations, B, H, S form, with at least one scalar operand:
2277 // the INDEX_IR, INDEX_RI and INDEX_RR opcodes
2278 def : InstRW<[V2Write_7cyc_1M0_1V02], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
2280 // Horizontal operations, D form, immediate operands only
2281 def : InstRW<[V2Write_5cyc_2V02], (instrs INDEX_II_D)>;
2283 // Horizontal operations, D form, with at least one scalar operand:
2284 // the INDEX_IR, INDEX_RI and INDEX_RR opcodes
2285 def : InstRW<[V2Write_8cyc_2M0_2V02], (instregex "^INDEX_(IR|RI|RR)_D")>;
2287 // Logical
// Covers the immediate (ZI), unpredicated (ZZZ) and predicated (ZPmZ/ZPZZ)
// forms. NOT_ZPmZ is selected by the NOT alternative of the last pattern, so it
// needs no pattern of its own.
2288 def : InstRW<[V2Write_2cyc_1V],
2289              (instregex "^(AND|EOR|ORR)_ZI",
2290                         "^(AND|BIC|EOR|ORR)_ZZZ",
2291                         "^EOR(BT|TB)_ZZZ_[BHSD]",
2292                         "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]")>;
2295 // Max/min, basic and pairwise
2296 def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
2297                                            "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]",
2298                                            "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>;
2300 // Matching operations
2301 // FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the
2302 // latency for this instruction is 4 cycles.
2303 def : InstRW<[V2Write_2or3cyc_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>;
2305 // Matrix multiply-accumulate
2306 def : InstRW<[V2Wr_ZMMA, V2Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
2308 // Move prefix
2309 def : InstRW<[V2Write_2cyc_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
2310                                            "^MOVPRFX_ZZ")>;
2312 // Multiply, B, H, S element size
2313 def : InstRW<[V2Write_4cyc_1V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
2314                                              "^MUL_ZPZZ_[BHS]",
2315                                              "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]",
2316                                              "^[SU]MULH_ZPZZ_[BHS]")>;
2318 // Multiply, D element size
2319 def : InstRW<[V2Write_5cyc_2V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
2320                                              "^MUL_ZPZZ_D",
2321                                              "^[SU]MULH_(ZPmZ|ZZZ)_D",
2322                                              "^[SU]MULH_ZPZZ_D")>;
2324 // Multiply long
2325 def : InstRW<[V2Write_4cyc_1V02], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
2326                                              "^[SU]MULL[BT]_ZZZ_[HSD]")>;
2328 // Multiply accumulate, B, H, S element size
// NOTE(review): the ZPmZZ entry carries an extra ReadDefault ahead of the
// V2Rd_* read; presumably it accounts for a leading operand (e.g. the governing
// predicate) so that V2Rd_* lands on the accumulator - confirm against the
// operand order of these opcodes.
2329 def : InstRW<[V2Wr_ZMABHS, V2Rd_ZMABHS],
2330              (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>;
2331 def : InstRW<[V2Wr_ZMABHS, ReadDefault, V2Rd_ZMABHS],
2332              (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
2334 // Multiply accumulate, D element size
// Same two-entry split as the B, H, S case: the ZPmZZ forms get the extra
// ReadDefault.
2335 def : InstRW<[V2Wr_ZMAD, V2Rd_ZMAD],
2336              (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>;
2337 def : InstRW<[V2Wr_ZMAD, ReadDefault, V2Rd_ZMAD],
2338              (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
2340 // Multiply accumulate long
2341 def : InstRW<[V2Wr_ZMAL, V2Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
2342                                                 "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
2344 // Multiply accumulate saturating doubling long regular
// The V2Rd_ZMASQ read is shared with the two SQRDML[AS]H/SQRDCMLAH entries
// further down; only the write type differs between them.
2345 def : InstRW<[V2Wr_ZMASQL, V2Rd_ZMASQ],
2346              (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]",
2347                         "^SQDML[AS]L[BT]_ZZZI_[SD]")>;
2349 // Multiply saturating doubling high, B, H, S element size
2350 def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQDMULH_ZZZ_[BHS]",
2351                                              "^SQDMULH_ZZZI_[HS]")>;
2353 // Multiply saturating doubling high, D element size
2354 def : InstRW<[V2Write_5cyc_2V02], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
2356 // Multiply saturating doubling long
2357 def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
2358                                              "^SQDMULL[BT]_ZZZI_[SD]")>;
2360 // Multiply saturating rounding doubling regular/complex accumulate, B, H, S
2361 // element size
2362 def : InstRW<[V2Wr_ZMASQBHS, V2Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
2363                                                      "^SQRDCMLAH_ZZZ_[BHS]",
2364                                                      "^SQRDML[AS]H_ZZZI_[HS]",
2365                                                      "^SQRDCMLAH_ZZZI_[HS]")>;
2367 // Multiply saturating rounding doubling regular/complex accumulate, D element
2368 // size
2369 def : InstRW<[V2Wr_ZMASQD, V2Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D",
2370                                                    "^SQRDCMLAH_ZZZ_D")>;
2372 // Multiply saturating rounding doubling regular/complex, B, H, S element size
2373 def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQRDMULH_ZZZ_[BHS]",
2374                                              "^SQRDMULH_ZZZI_[HS]")>;
2376 // Multiply saturating rounding doubling regular/complex, D element size
2377 def : InstRW<[V2Write_5cyc_2V02], (instregex "^SQRDMULH_ZZZI?_D")>;
2379 // Multiply/multiply long, (8x8) polynomial
2380 def : InstRW<[V2Write_2cyc_1V23], (instregex "^PMUL_ZZZ_B",
2381                                              "^PMULL[BT]_ZZZ_[HDQ]")>;
2383 // Predicate counting vector
// (plain and saturating [SU]Q forms of DEC/INC for H, W and D)
2384 def : InstRW<[V2Write_2cyc_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>;
2386 // Reciprocal estimate
// (unsigned integer URECPE / URSQRTE; only the _S opcodes are listed)
2387 def : InstRW<[V2Write_4cyc_2V02], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
2389 // Reduction, arithmetic, B form
2390 def : InstRW<[V2Write_9cyc_2V_4V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
2392 // Reduction, arithmetic, H form
2393 def : InstRW<[V2Write_8cyc_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
2395 // Reduction, arithmetic, S form
2396 def : InstRW<[V2Write_6cyc_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
2398 // Reduction, arithmetic, D form
2399 def : InstRW<[V2Write_4cyc_2V], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
2401 // Reduction, logical
// (the opcodes are ANDV, EORV and ORV, hence the "OR" alternative)
2402 def : InstRW<[V2Write_6cyc_1V_1V13], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>;
2404 // Reverse, vector
2405 def : InstRW<[V2Write_2cyc_1V], (instregex "^REV_ZZ_[BHSD]",
2406                                            "^REVB_ZPmZ_[HSD]",
2407                                            "^REVH_ZPmZ_[SD]",
2408                                            "^REVW_ZPmZ_D")>;
2410 // Select, vector form
2411 def : InstRW<[V2Write_2cyc_1V], (instregex "^SEL_ZPZZ_[BHSD]")>;
2413 // Table lookup
// (`ZZZZ?` covers both the ZZZ and the ZZZZ opcode names)
2414 def : InstRW<[V2Write_2cyc_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>;
2416 // Table lookup extension
2417 def : InstRW<[V2Write_2cyc_1V], (instregex "^TBX_ZZZ_[BHSD]")>;
2419 // Transpose, vector form
2420 def : InstRW<[V2Write_2cyc_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
2422 // Unpack and extend
2423 def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
2425 // Zip/unzip
2426 def : InstRW<[V2Write_2cyc_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
2428 // SVE floating-point instructions
2429 // -----------------------------------------------------------------------------
2431 // Floating point absolute value/difference
// `FAB[SD]_ZPmZ` selects both FABS and FABD in their ZPmZ form, so FABS needs
// no pattern of its own; FABD additionally has a ZPZZ form.
2432 def : InstRW<[V2Write_2cyc_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]",
2433                                            "^FABD_ZPZZ_[HSD]")>;
2436 // Floating point arithmetic
// (FADD/FSUB/FSUBR in their listed forms, plus FADDP and FNEG)
2437 def : InstRW<[V2Write_2cyc_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]",
2438                                            "^F(ADD|SUB)_ZPZ[IZ]_[HSD]",
2439                                            "^FADDP_ZPmZZ_[HSD]",
2440                                            "^FNEG_ZPmZ_[HSD]",
2441                                            "^FSUBR_ZPm[IZ]_[HSD]",
2442                                            "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>;
2444 // Floating point associative add, F16
2445 def : InstRW<[V2Write_10cyc_1V1_9rc], (instrs FADDA_VPZ_H)>;
2447 // Floating point associative add, F32
2448 def : InstRW<[V2Write_6cyc_1V1_5rc], (instrs FADDA_VPZ_S)>;
2450 // Floating point associative add, F64
2451 def : InstRW<[V2Write_4cyc_1V], (instrs FADDA_VPZ_D)>;
2453 // Floating point compare
// (`PPzZ[0Z]` covers compare-with-zero and register-register forms; FCMLE and
// FCMLT are listed in their compare-with-zero form only)
2454 def : InstRW<[V2Write_2cyc_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]",
2455                                             "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
2456                                             "^FCM(LE|LT)_PPzZ0_[HSD]",
2457                                             "^FCMUO_PPzZZ_[HSD]")>;
2459 // Floating point complex add
2460 def : InstRW<[V2Write_3cyc_1V], (instregex "^FCADD_ZPmZ_[HSD]")>;
2462 // Floating point complex multiply add
// NOTE(review): the ZPmZZ entry carries an extra ReadDefault ahead of
// V2Rd_ZFCMA, presumably for a leading operand such as the governing
// predicate - confirm against the operand order.
2463 def : InstRW<[V2Wr_ZFCMA, ReadDefault, V2Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
2464 def : InstRW<[V2Wr_ZFCMA, V2Rd_ZFCMA],              (instregex "^FCMLA_ZZZI_[HS]")>;
2466 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
2467 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
2468                                              "^FCVTLT_ZPmZ_HtoS",
2469                                              "^FCVTNT_ZPmZ_StoH")>;
2471 // Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
2472 // or F64 to F16)
2473 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
2474                                              "^FCVTLT_ZPmZ_StoD",
2475                                              "^FCVTNT_ZPmZ_DtoS")>;
2477 // Floating point convert, round to odd
2478 def : InstRW<[V2Write_3cyc_1V02], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>;
2480 // Floating point base2 log, F16
2481 def : InstRW<[V2Write_6cyc_4V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
2483 // Floating point base2 log, F32
2484 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
2486 // Floating point base2 log, F64
2487 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
2489 // Floating point convert to integer, F16
2490 def : InstRW<[V2Write_6cyc_4V02], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
2492 // Floating point convert to integer, F32
2493 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
2495 // Floating point convert to integer, F64
2496 def : InstRW<[V2Write_3cyc_1V02],
2497              (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
2499 // Floating point copy
// (FCPY with immediate and FDUP with immediate)
2500 def : InstRW<[V2Write_2cyc_1V], (instregex "^FCPY_ZPmI_[HSD]",
2501                                            "^FDUP_ZI_[HSD]")>;
2503 // Floating point divide, F16
// (`FDIVR?` covers both FDIV and the reversed-operand FDIVR opcode names)
2504 def : InstRW<[V2Write_13cyc_1V02_12rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
2506 // Floating point divide, F32
2507 def : InstRW<[V2Write_10cyc_1V02_9rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
2509 // Floating point divide, F64
2510 def : InstRW<[V2Write_15cyc_1V02_14rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
2512 // Floating point min/max pairwise
2513 def : InstRW<[V2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
2515 // Floating point min/max
2516 def : InstRW<[V2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]",
2517                                            "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>;
2519 // Floating point multiply
// (FMUL and FMULX in their listed forms, plus FSCALE)
2520 def : InstRW<[V2Write_3cyc_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
2521                                            "^FMULX_ZPZZ_[HSD]",
2522                                            "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]",
2523                                            "^FMUL_ZPZ[IZ]_[HSD]")>;
2525 // Floating point multiply accumulate
// NOTE(review): the ZPmZZ entry carries an extra ReadDefault ahead of
// V2Rd_ZFMA, presumably for a leading operand such as the governing predicate;
// the ZZZI and ZPZZZ forms are listed without it - confirm against the operand
// order of each form.
2526 def : InstRW<[V2Wr_ZFMA, ReadDefault, V2Rd_ZFMA],
2527              (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
2528                         "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
2529 def : InstRW<[V2Wr_ZFMA, V2Rd_ZFMA],
2530              (instregex "^FML[AS]_ZZZI_[HSD]",
2531                         "^FN?ML[AS]_ZPZZZ_[HSD]")>;
2533 // Floating point multiply add/sub accumulate long
2534 def : InstRW<[V2Wr_ZFMAL, V2Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
2536 // Floating point reciprocal estimate, F16
// (FRECPE, FRSQRTE and FRECPX share one entry per element size)
2537 def : InstRW<[V2Write_6cyc_4V02], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
2539 // Floating point reciprocal estimate, F32
2540 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>;
2542 // Floating point reciprocal estimate, F64
2543 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>;
2545 // Floating point reciprocal step
2546 def : InstRW<[V2Write_4cyc_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
2548 // Floating point reduction, F16
2549 def : InstRW<[V2Write_8cyc_4V],
2550              (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>;
2552 // Floating point reduction, F32
2553 def : InstRW<[V2Write_6cyc_3V],
2554              (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>;
2556 // Floating point reduction, F64
2557 def : InstRW<[V2Write_4cyc_2V],
2558              (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>;
2560 // Floating point round to integral, F16
2561 def : InstRW<[V2Write_6cyc_4V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
2563 // Floating point round to integral, F32
2564 def : InstRW<[V2Write_4cyc_2V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
2566 // Floating point round to integral, F64
2567 def : InstRW<[V2Write_3cyc_1V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
2569 // Floating point square root, F16
// One pattern per element size selects the FSQRT_ZPmZ opcode; listing the same
// pattern twice selects nothing extra.
2570 def : InstRW<[V2Write_13cyc_1V0_12rc], (instregex "^FSQRT_ZPmZ_H")>;
2572 // Floating point square root, F32
2573 def : InstRW<[V2Write_10cyc_1V0_9rc], (instregex "^FSQRT_ZPmZ_S")>;
2575 // Floating point square root, F64
2576 def : InstRW<[V2Write_16cyc_1V0_14rc], (instregex "^FSQRT_ZPmZ_D")>;
2578 // Floating point trigonometric exponentiation
// (the FEXPA opcodes)
2579 def : InstRW<[V2Write_3cyc_1V1], (instregex "^FEXPA_ZZ_[HSD]")>;
2581 // Floating point trigonometric multiply add
// (the FTMAD opcodes)
2582 def : InstRW<[V2Write_4cyc_1V], (instregex "^FTMAD_ZZI_[HSD]")>;
2584 // Floating point trigonometric, miscellaneous
// (the FTSMUL and FTSSEL opcodes)
2585 def : InstRW<[V2Write_3cyc_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>;
2587 // SVE BFloat16 (BF16) instructions
2588 // -----------------------------------------------------------------------------
2590 // Convert, F32 to BF16
2591 def : InstRW<[V2Write_4cyc_1V02], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
2593 // Dot product
// (indexed ZZI and vector ZZZ forms of BFDOT)
2594 def : InstRW<[V2Wr_ZBFDOT, V2Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
2596 // Matrix multiply accumulate
2597 def : InstRW<[V2Wr_ZBFMMA, V2Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>;
2599 // Multiply accumulate long
// (BFMLALB and BFMLALT, in both the ZZZ and ZZZI opcode names)
2600 def : InstRW<[V2Wr_ZBFMAL, V2Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>;
2602 // SVE Load instructions
2603 // -----------------------------------------------------------------------------
// Most patterns in this section end in `$`, so each one selects exactly the
// named opcodes rather than every opcode that begins with the pattern.
2605 // Load vector
2606 def : InstRW<[V2Write_6cyc_1L], (instrs LDR_ZXI)>;
2608 // Load predicate
2609 def : InstRW<[V2Write_6cyc_1L_1M], (instrs LDR_PXI)>;
2611 // Contiguous load, scalar + imm
2612 def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM$",
2613                                            "^LD1S?B_[HSD]_IMM$",
2614                                            "^LD1S?H_[SD]_IMM$",
2615                                            "^LD1S?W_D_IMM$" )>;
2616 // Contiguous load, scalar + scalar
2617 def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1[BHWD]$",
2618                                            "^LD1S?B_[HSD]$",
2619                                            "^LD1S?H_[SD]$",
2620                                            "^LD1S?W_D$" )>;
2622 // Contiguous load broadcast, scalar + imm
2623 def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1R[BHWD]_IMM$",
2624                                            "^LD1RS?B_[HSD]_IMM$",
2625                                            "^LD1RS?H_[SD]_IMM$",
2626                                            "^LD1RW_D_IMM$",
2627                                            "^LD1RSW_IMM$",
2628                                            "^LD1RQ_[BHWD]_IMM$")>;
2630 // Contiguous load broadcast, scalar + scalar
2631 def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1RQ_[BHWD]$")>;
2633 // Non temporal load, scalar + imm
2634 // Non temporal load, scalar + scalar
// (a single entry serves both forms through the `_ZR[IR]` alternation)
2635 def : InstRW<[V2Write_6cyc_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>;
2637 // Non temporal gather load, vector + scalar 32-bit element size
2638 def : InstRW<[V2Write_9cyc_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S_REAL$",
2639                                               "^LDNT1S[BH]_ZZR_S_REAL$")>;
2641 // Non temporal gather load, vector + scalar 64-bit element size
2642 def : InstRW<[V2Write_9cyc_2L_2V1], (instregex "^LDNT1S?[BHW]_ZZR_D_REAL$")>;
2643 def : InstRW<[V2Write_9cyc_2L_2V1], (instrs LDNT1D_ZZR_D_REAL)>;
2645 // Contiguous first faulting load, scalar + scalar
2646 def : InstRW<[V2Write_6cyc_1L_1S], (instregex "^LDFF1[BHWD]_REAL$",
2647                                               "^LDFF1S?B_[HSD]_REAL$",
2648                                               "^LDFF1S?H_[SD]_REAL$",
2649                                               "^LDFF1S?W_D_REAL$")>;
2651 // Contiguous non faulting load, scalar + imm
2652 def : InstRW<[V2Write_6cyc_1L], (instregex "^LDNF1[BHWD]_IMM_REAL$",
2653                                            "^LDNF1S?B_[HSD]_IMM_REAL$",
2654                                            "^LDNF1S?H_[SD]_IMM_REAL$",
2655                                            "^LDNF1S?W_D_IMM_REAL$")>;
2657 // Contiguous Load two structures to two vectors, scalar + imm
2658 def : InstRW<[V2Write_8cyc_2L_2V], (instregex "^LD2[BHWD]_IMM$")>;
2660 // Contiguous Load two structures to two vectors, scalar + scalar
2661 def : InstRW<[V2Write_9cyc_2L_2V_2S], (instregex "^LD2[BHWD]$")>;
2663 // Contiguous Load three structures to three vectors, scalar + imm
2664 def : InstRW<[V2Write_9cyc_3L_3V], (instregex "^LD3[BHWD]_IMM$")>;
2666 // Contiguous Load three structures to three vectors, scalar + scalar
2667 def : InstRW<[V2Write_10cyc_3V_3L_3S], (instregex "^LD3[BHWD]$")>;
2669 // Contiguous Load four structures to four vectors, scalar + imm
2670 def : InstRW<[V2Write_9cyc_4L_8V], (instregex "^LD4[BHWD]_IMM$")>;
2672 // Contiguous Load four structures to four vectors, scalar + scalar
2673 def : InstRW<[V2Write_10cyc_4L_8V_4S], (instregex "^LD4[BHWD]$")>;
2675 // Gather load, vector + imm, 32-bit element size
// (`(FF)?` makes each gather pattern cover the GLD1 and GLDFF1 opcode names)
2676 def : InstRW<[V2Write_9cyc_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM_REAL$",
2677                                               "^GLD(FF)?1W_IMM_REAL$")>;
2679 // Gather load, vector + imm, 64-bit element size
2680 def : InstRW<[V2Write_9cyc_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM_REAL$",
2681                                               "^GLD(FF)?1D_IMM_REAL$")>;
2683 // Gather load, 32-bit scaled offset
// `(FF)?` covers the GLD1 and GLDFF1 opcode names. Both patterns end in `$`,
// like the other gather-load entries, so each selects exactly the named
// opcodes.
2684 def : InstRW<[V2Write_10cyc_1L_8V],
2685              (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED_REAL$",
2686                         "^GLD(FF)?1W_[SU]XTW_SCALED_REAL$")>;
2688 // Gather load, 64-bit scaled offset
2689 // NOTE: These instructions are not specified in the SOG.
// (`([SU]XTW_)?` makes the extend part of the opcode name optional, so both the
// names with and without it are covered)
2690 def : InstRW<[V2Write_10cyc_1L_4V],
2691              (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED_REAL$",
2692                         "^GLD(FF)?1D_([SU]XTW_)?SCALED_REAL$")>;
2694 // Gather load, 32-bit unpacked unscaled offset
2695 def : InstRW<[V2Write_9cyc_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW_REAL$",
2696                                               "^GLD(FF)?1W_[SU]XTW_REAL$")>;
2698 // Gather load, 64-bit unpacked unscaled offset
2699 // NOTE: These instructions are not specified in the SOG.
2700 def : InstRW<[V2Write_9cyc_1L_2V],
2701              (instregex "^GLD(FF)?1S?[BHW]_D_([SU]XTW_)?REAL$",
2702                         "^GLD(FF)?1D_([SU]XTW_)?REAL$")>;
2704 // SVE Store instructions
2705 // -----------------------------------------------------------------------------
2707 // Store from predicate reg
2708 def : InstRW<[V2Write_1cyc_1L01], (instrs STR_PXI)>;
2710 // Store from vector reg
2711 def : InstRW<[V2Write_2cyc_1L01_1V01], (instrs STR_ZXI)>;
2713 // Contiguous store, scalar + imm
2714 def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^ST1[BHWD]_IMM$",
2715                                                   "^ST1B_[HSD]_IMM$",
2716                                                   "^ST1H_[SD]_IMM$",
2717                                                   "^ST1W_D_IMM$")>;
2719 // Contiguous store, scalar + scalar
// The ST1H opcodes are split out with their own write type (one that also
// names S); the B, W and D opcodes share the scalar + imm write type.
// NOTE(review): the same H-only split recurs for ST2H and STNT1H_ZRR below -
// confirm the reason against the SOG.
2720 def : InstRW<[V2Write_2cyc_1L01_1S_1V01], (instregex "^ST1H(_[SD])?$")>;
2721 def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^ST1[BWD]$",
2722                                                   "^ST1B_[HSD]$",
2723                                                   "^ST1W_D$")>;
2725 // Contiguous store two structures from two vectors, scalar + imm
2726 def : InstRW<[V2Write_4cyc_1L01_1V01], (instregex "^ST2[BHWD]_IMM$")>;
2728 // Contiguous store two structures from two vectors, scalar + scalar
2729 def : InstRW<[V2Write_4cyc_2L01_2S_2V01], (instrs ST2H)>;
2730 def : InstRW<[V2Write_4cyc_2L01_2V01], (instregex "^ST2[BWD]$")>;
2732 // Contiguous store three structures from three vectors, scalar + imm
2733 def : InstRW<[V2Write_7cyc_9L01_9V01], (instregex "^ST3[BHWD]_IMM$")>;
2735 // Contiguous store three structures from three vectors, scalar + scalar
2736 def : InstRW<[V2Write_7cyc_9L01_9S_9V01], (instregex "^ST3[BHWD]$")>;
2738 // Contiguous store four structures from four vectors, scalar + imm
2739 def : InstRW<[V2Write_11cyc_18L01_18V01], (instregex "^ST4[BHWD]_IMM$")>;
2741 // Contiguous store four structures from four vectors, scalar + scalar
2742 def : InstRW<[V2Write_11cyc_18L01_18S_18V01], (instregex "^ST4[BHWD]$")>;
2744 // Non temporal store, scalar + imm
2745 def : InstRW<[V2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>;
2747 // Non temporal store, scalar + scalar
2748 def : InstRW<[V2Write_2cyc_1L01_1S_1V], (instrs STNT1H_ZRR)>;
2749 def : InstRW<[V2Write_2cyc_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>;
2751 // Scatter non temporal store, vector + scalar 32-bit element size
// NOTE(review): this pattern and the next have no trailing `$`, so they
// presumably also select opcode names that continue past `_S` / `_D` (for
// example a _REAL suffix) - confirm against the opcode list.
2752 def : InstRW<[V2Write_4cyc_4L01_4V01], (instregex "^STNT1[BHW]_ZZR_S")>;
2754 // Scatter non temporal store, vector + scalar 64-bit element size
2755 def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^STNT1[BHWD]_ZZR_D")>;
2757 // Scatter store vector + imm 32-bit element size
2758 def : InstRW<[V2Write_4cyc_4L01_4V01], (instregex "^SST1[BH]_S_IMM$",
2759                                                   "^SST1W_IMM$")>;
2761 // Scatter store vector + imm 64-bit element size
2762 def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[BHW]_D_IMM$",
2763                                                   "^SST1D_IMM$")>;
2765 // Scatter store, 32-bit scaled offset
2766 def : InstRW<[V2Write_4cyc_4L01_4V01],
2767              (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
2769 // Scatter store, 32-bit unpacked unscaled offset
2770 def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[BHW]_D_[SU]XTW$",
2771                                                   "^SST1D_[SU]XTW$")>;
2773 // Scatter store, 32-bit unpacked scaled offset
2774 def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
2775                                                   "^SST1D_[SU]XTW_SCALED$")>;
2777 // Scatter store, 32-bit unscaled offset
2778 def : InstRW<[V2Write_4cyc_4L01_4V01], (instregex "^SST1[BH]_S_[SU]XTW$",
2779                                                   "^SST1W_[SU]XTW$")>;
2781 // Scatter store, 64-bit scaled offset
2782 def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[HW]_D_SCALED$",
2783                                                   "^SST1D_SCALED$")>;
2785 // Scatter store, 64-bit unscaled offset
2786 def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[BHW]_D$",
2787                                                   "^SST1D$")>;
2789 // SVE Miscellaneous instructions
2790 // -----------------------------------------------------------------------------
2792 // Read first fault register, unpredicated
2793 def : InstRW<[V2Write_2cyc_1M0], (instrs RDFFR_P_REAL)>;
2795 // Read first fault register, predicated
2796 def : InstRW<[V2Write_3or4cyc_1M0_1M], (instrs RDFFR_PPz_REAL)>;
2798 // Read first fault register and set flags
2799 def : InstRW<[V2Write_4or5cyc_2M0_2M], (instrs RDFFRS_PPz)>;
2801 // Set first fault register
2802 // Write to first fault register
// (SETFFR and WRFFR share one entry)
2803 def : InstRW<[V2Write_2cyc_1M0], (instrs SETFFR, WRFFR)>;
2805 // Prefetch
2806 // NOTE: This is not specified in the SOG.
// The pattern has no trailing `$`: it selects every opcode whose name begins
// with PRFB, PRFH, PRFW or PRFD.
2807 def : InstRW<[V2Write_4cyc_1L], (instregex "^PRF[BHWD]")>;
2809 // SVE Cryptographic instructions
2810 // -----------------------------------------------------------------------------
2812 // Crypto AES ops
// (AESD, AESE, AESMC and AESIMC)
2813 def : InstRW<[V2Write_2cyc_1V], (instregex "^AES[DE]_ZZZ_B$",
2814                                            "^AESI?MC_ZZ_B$")>;
2816 // Crypto SHA3 ops
// (BCAX, EOR3, RAX1 and XAR)
2817 def : InstRW<[V2Write_2cyc_1V0], (instregex "^(BCAX|EOR3)_ZZZZ$",
2818                                             "^RAX1_ZZZ_D$",
2819                                             "^XAR_ZZZI_[BHSD]$")>;
2821 // Crypto SM4 ops
// (SM4E and SM4EKEY)
2822 def : InstRW<[V2Write_4cyc_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>;