[SampleProfileLoader] Fix integer overflow in generateMDProfMetadata (#90217)
[llvm-project.git] / llvm / lib / Target / AArch64 / AArch64SchedAmpere1B.td
blob67f8593f1577a3037ef3186150d62781933d70b3
1 //=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the machine model for the Ampere Computing Ampere-1B to
10 // support instruction scheduling and other instruction cost heuristics.
12 //===----------------------------------------------------------------------===//
14 // The Ampere-1B core is an out-of-order micro-architecture.  The front
15 // end has branch prediction, with a 10-cycle recovery time from a
16 // mispredicted branch.  Instructions coming out of the front end are
17 // decoded into internal micro-ops (uops).
19 def Ampere1BModel : SchedMachineModel {
20   let IssueWidth            =  12;  // Maximum micro-ops dispatch rate.
21   let MicroOpBufferSize     = 208;  // micro-op re-order buffer size
22   let LoadLatency           =   3;  // Optimistic load latency
23   let MispredictPenalty     =  10;  // Branch mispredict penalty
24   let LoopMicroOpBufferSize =  32;  // Instruction queue size
25   let CompleteModel         =   1;
27   list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
28                                                     SMEUnsupported.F,
29                                                     PAUnsupported.F);
32 let SchedModel = Ampere1BModel in {
34 //===----------------------------------------------------------------------===//
35 // Define each kind of processor resource and number available on Ampere-1B.
37 def Ampere1BUnitA  : ProcResource<2>;  // integer single-cycle, branch, and flags r/w
38 def Ampere1BUnitB  : ProcResource<2>;  // integer single-cycle, and complex shifts
39 def Ampere1BUnitBS : ProcResource<1>;  // integer multi-cycle
40 def Ampere1BUnitL  : ProcResource<2>;  // load
41 def Ampere1BUnitS  : ProcResource<2>;  // store address calculation
42 def Ampere1BUnitX  : ProcResource<1>;  // FP and vector operations, and flag write
43 def Ampere1BUnitY  : ProcResource<1>;  // FP and vector operations, and crypto
44 def Ampere1BUnitZ  : ProcResource<1>;  // FP store data and FP-to-integer moves
46 def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>;
47 def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>;
49 //===----------------------------------------------------------------------===//
50 // Define customized scheduler read/write types specific to the Ampere-1B.
52 def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]> {
53   let Latency = 1;
54   let NumMicroOps = 1;
57 def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]> {
58   let Latency = 1;
59   let NumMicroOps = 2;
62 def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]> {
63   let Latency = 1;
64   let NumMicroOps = 1;
67 def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> {
68   let Latency = 1;
69   let NumMicroOps = 1;
72 def Ampere1BWrite_1cyc_1BS_1B : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]> {
73   let Latency = 1;
74   let NumMicroOps = 2;
77 def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]> {
78   let Latency = 1;
79   let NumMicroOps = 1;
82 def Ampere1BWrite_1cyc_1AB_1A : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]> {
83   let Latency = 1;
84   let NumMicroOps = 2;
87 def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]> {
88   let Latency = 1;
89   let NumMicroOps = 1;
92 def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]> {
93   let Latency = 1;
94   let NumMicroOps = 1;
97 def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]> {
98   let Latency = 1;
99   let NumMicroOps = 2;
102 def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]> {
103   let Latency = 2;
104   let NumMicroOps = 1;
107 def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]> {
108   let Latency = 2;
109   let NumMicroOps = 2;
112 def Ampere1BWrite_2cyc_1B_1AB : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]> {
113   let Latency = 2;
114   let NumMicroOps = 2;
117 def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]> {
118   let Latency = 2;
119   let NumMicroOps = 2;
122 def Ampere1BWrite_2cyc_1B_1S_1AB : SchedWriteRes<[Ampere1BUnitB,
123                                                   Ampere1BUnitS,
124                                                   Ampere1BUnitAB]> {
125   let Latency = 2;
126   let NumMicroOps = 3;
129 def Ampere1BWrite_2cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS,
130                                               Ampere1BUnitZ,
131                                               Ampere1BUnitZ]> {
132   let Latency = 2;
133   let NumMicroOps = 3;
136 def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> {
137   let Latency = 2;
138   let NumMicroOps = 1;
141 def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]> {
142   let Latency = 2;
143   let NumMicroOps = 2;
146 def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> {
147   let Latency = 3;
148   let NumMicroOps = 1;
151 def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]> {
152   let Latency = 3;
153   let NumMicroOps = 1;
156 def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
157   let Latency = 3;
158   let NumMicroOps = 1;
161 def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> {
162   let Latency = 3;
163   let NumMicroOps = 1;
166 def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]> {
167   let Latency = 3;
168   let NumMicroOps = 1;
171 def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS,
172                                               Ampere1BUnitZ]> {
173   let Latency = 3;
174   let NumMicroOps = 2;
177 def Ampere1BWrite_3cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS,
178                                               Ampere1BUnitZ, Ampere1BUnitZ]> {
179   let Latency = 3;
180   let NumMicroOps = 3;
183 def Ampere1BWrite_3cyc_2S_2Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS,
184                                               Ampere1BUnitZ, Ampere1BUnitZ]> {
185   let Latency = 3;
186   let NumMicroOps = 4;
189 def Ampere1BWrite_4cyc_1BS_1AB : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]> {
190   let Latency = 4;
191   let NumMicroOps = 2;
194 def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]> {
195   let Latency = 4;
196   let NumMicroOps = 1;
199 def Ampere1BWrite_4cyc_2L : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL]> {
200   let Latency = 4;
201   let NumMicroOps = 2;
204 def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]> {
205   let Latency = 4;
206   let NumMicroOps = 2;
209 def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
210   let Latency = 4;
211   let NumMicroOps = 1;
214 def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> {
215   let Latency = 4;
216   let NumMicroOps = 1;
219 def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> {
220   let Latency = 4;
221   let NumMicroOps = 2;
224 def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> {
225   let Latency = 5;
226   let NumMicroOps = 1;
229 def Ampere1BWrite_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1BUnitXY,
230                                                   Ampere1BUnitS,
231                                                   Ampere1BUnitZ]> {
232   let Latency = 4;
233   let NumMicroOps = 3;
236 def Ampere1BWrite_4cyc_3S_3Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS,
237                                               Ampere1BUnitS, Ampere1BUnitZ,
238                                               Ampere1BUnitZ, Ampere1BUnitZ]> {
239   let Latency = 4;
240   let NumMicroOps = 6;
243 def Ampere1BWrite_5cyc_4S_4Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS,
244                                               Ampere1BUnitS, Ampere1BUnitS,
245                                               Ampere1BUnitZ, Ampere1BUnitZ,
246                                               Ampere1BUnitZ, Ampere1BUnitZ]> {
247   let Latency = 5;
248   let NumMicroOps = 8;
251 def Ampere1BWrite_5cyc_1L_1BS : SchedWriteRes<[Ampere1BUnitL,
252                                                Ampere1BUnitBS]> {
253   let Latency = 5;
254   let NumMicroOps = 2;
257 def Ampere1BWrite_5cyc_3L : SchedWriteRes<[Ampere1BUnitL,
258                                            Ampere1BUnitL,
259                                            Ampere1BUnitL]> {
260   let Latency = 5;
261   let NumMicroOps = 3;
264 def Ampere1BWrite_5cyc_4L : SchedWriteRes<[Ampere1BUnitL,
265                                            Ampere1BUnitL,
266                                            Ampere1BUnitL,
267                                            Ampere1BUnitL]> {
268   let Latency = 5;
269   let NumMicroOps = 4;
272 def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
273   let Latency = 5;
274   let NumMicroOps = 1;
277 def Ampere1BWrite_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
278                                                   Ampere1BUnitS,  Ampere1BUnitS,
279                                                   Ampere1BUnitZ,  Ampere1BUnitZ]> {
280   let Latency = 5;
281   let NumMicroOps = 6;
284 def Ampere1BWrite_6cyc_1BS_1A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]> {
285   let Latency = 6;
286   let NumMicroOps = 2;
289 def Ampere1BWrite_6cyc_1BS_2A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA,
290                                                                Ampere1BUnitA]> {
291   let Latency = 6;
292   let NumMicroOps = 3;
295 def Ampere1BWrite_6cyc_1L_1XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitXY]> {
296   let Latency = 6;
297   let NumMicroOps = 2;
300 def Ampere1BWrite_6cyc_2L_2XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
301                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
302   let Latency = 6;
303   let NumMicroOps = 4;
306 def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
307   let Latency = 6;
308   let NumMicroOps = 1;  // one uop for the single X-unit entry, as the "_1X" suffix states
311 def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> {
312   let Latency = 6;
313   let NumMicroOps = 2;
316 def Ampere1BWrite_6cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
317                                             Ampere1BUnitXY]> {
318   let Latency = 6;
319   let NumMicroOps = 3;
322 def Ampere1BWrite_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
323                                                   Ampere1BUnitS,  Ampere1BUnitS,
324                                                   Ampere1BUnitZ,  Ampere1BUnitZ]> {
325   let Latency = 6;
326   let NumMicroOps = 6;
329 def Ampere1BWrite_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY,
330                                                   Ampere1BUnitS,  Ampere1BUnitS,  Ampere1BUnitS,
331                                                   Ampere1BUnitZ,  Ampere1BUnitZ,  Ampere1BUnitZ]> {
332   let Latency = 6;
333   let NumMicroOps = 9;
336 def Ampere1BWrite_7cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> {
337   let Latency = 7;
338   let NumMicroOps = 2;
341 def Ampere1BWrite_7cyc_1XY_1Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]> {
342   let Latency = 7;
343   let NumMicroOps = 2;
346 def Ampere1BWrite_7cyc_1X_1Z : SchedWriteRes<[Ampere1BUnitX, Ampere1BUnitZ]> {
347   let Latency = 7;
348   let NumMicroOps = 2;
351 def Ampere1BWrite_7cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
352                                                Ampere1BUnitL,  Ampere1BUnitXY,
353                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
354   let Latency = 7;
355   let NumMicroOps = 6;
358 def Ampere1BWrite_7cyc_4L_4XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
359                                                Ampere1BUnitL,  Ampere1BUnitL,
360                                                Ampere1BUnitXY, Ampere1BUnitXY,
361                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
362   let Latency = 7;
363   let NumMicroOps = 8;
366 def Ampere1BWrite_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
367                                                   Ampere1BUnitXY, Ampere1BUnitXY,
368                                                   Ampere1BUnitS,  Ampere1BUnitS,
369                                                   Ampere1BUnitS,  Ampere1BUnitS,
370                                                   Ampere1BUnitZ,  Ampere1BUnitZ,
371                                                   Ampere1BUnitZ,  Ampere1BUnitZ]> {
372   let Latency = 7;
373   let NumMicroOps = 12;
376 def Ampere1BWrite_8cyc_1BS_1L : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]> {
377   let Latency = 8;
378   let NumMicroOps = 2;
381 def Ampere1BWrite_8cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> {
382   let Latency = 8;
383   let NumMicroOps = 2;
386 def Ampere1BWrite_8cyc_2L_3XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
387                                                Ampere1BUnitXY, Ampere1BUnitXY,
388                                                Ampere1BUnitXY]> {
389   let Latency = 8;
390   let NumMicroOps = 5;
393 def Ampere1BWrite_8cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
394                                                Ampere1BUnitL,  Ampere1BUnitXY,
395                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
396   let Latency = 8;
397   let NumMicroOps = 6;
400 def Ampere1BWrite_8cyc_4L_4XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
401                                                Ampere1BUnitL,  Ampere1BUnitL,
402                                                Ampere1BUnitXY, Ampere1BUnitXY,
403                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
404   let Latency = 8;
405   let NumMicroOps = 8;
408 def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> {
409   let Latency = 8;
410   let NumMicroOps = 2;
413 def Ampere1BWrite_8cyc_4XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
414                                             Ampere1BUnitXY, Ampere1BUnitXY]> {
415   let Latency = 8;
416   let NumMicroOps = 4;
419 def Ampere1BWrite_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
420                                                   Ampere1BUnitXY, Ampere1BUnitXY,
421                                                   Ampere1BUnitXY, Ampere1BUnitXY,
422                                                   Ampere1BUnitS,  Ampere1BUnitS,
423                                                   Ampere1BUnitS,  Ampere1BUnitS,
424                                                   Ampere1BUnitZ,  Ampere1BUnitZ,
425                                                   Ampere1BUnitZ,  Ampere1BUnitZ]> {
426   let Latency = 9;
427   let NumMicroOps = 14;
430 def Ampere1BWrite_9cyc_1A_1BS_1X : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]> {
431   let Latency = 9;
432   let NumMicroOps = 3;
435 def Ampere1BWrite_9cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]> {
436   let Latency = 9;
437   let NumMicroOps = 3;
440 def Ampere1BWrite_9cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
441                                                Ampere1BUnitL,  Ampere1BUnitXY,
442                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
443   let Latency = 9;
444   let NumMicroOps = 6;
447 def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
448   let Latency = 9;
449   let NumMicroOps = 1;
452 def Ampere1BWrite_9cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> {
453   let Latency = 9;
454   let NumMicroOps = 3;
457 def Ampere1BWrite_10cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
458                                                 Ampere1BUnitL,  Ampere1BUnitL,
459                                                 Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY,
460                                                 Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> {
461   let Latency = 10;
462   let NumMicroOps = 12;  // 4 L uops + 8 XY uops: one per resource entry listed above
465 def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> {
466   let Latency = 11;
467   let NumMicroOps = 3;
470 def Ampere1BWrite_11cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
471                                                 Ampere1BUnitL,  Ampere1BUnitL,
472                                                 Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY,
473                                                 Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> {
474   let Latency = 11;
475   let NumMicroOps = 12;  // 4 L uops + 8 XY uops: one per resource entry listed above
478 def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
479   let Latency = 12;
480   let NumMicroOps = 1;
483 def Ampere1BWrite_13cyc_1BS_1X : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]> {
484   let Latency = 13;
485   let NumMicroOps = 2;
488 def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
489   let Latency = 17;
490   let NumMicroOps = 1;
493 def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS,
494                                                 Ampere1BUnitBS,
495                                                 Ampere1BUnitX]> {
496   let Latency = 19;  // must match the "19cyc" in the record name (was 13)
497   let NumMicroOps = 3;
500 def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
501   let Latency = 19;
502   let NumMicroOps = 1;
505 def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
506   let Latency = 21;
507   let NumMicroOps = 1;
510 def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
511   let Latency = 33;
512   let NumMicroOps = 1;
515 def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
516   let Latency = 39;
517   let NumMicroOps = 1;
520 def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
521   let Latency = 63;
522   let NumMicroOps = 1;
525 // For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
526 // which are a single uop, and for extended registers, which have full flexibility
527 // across Unit A or B for both uops.
528 def Ampere1BWrite_Arith : SchedWriteVariant<[
529                                 SchedVar<RegExtendedPred, [Ampere1BWrite_2cyc_2AB]>,
530                                 SchedVar<IsCheapLSL,      [Ampere1BWrite_1cyc_1AB]>,
531                                 SchedVar<NoSchedPred,     [Ampere1BWrite_2cyc_1B_1AB]>]>;
533 def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[
534                                 SchedVar<RegExtendedPred, [Ampere1BWrite_2cyc_2AB]>,
535                                 SchedVar<IsCheapLSL,      [Ampere1BWrite_1cyc_1AB]>,
536                                 SchedVar<NoSchedPred,     [Ampere1BWrite_2cyc_1B_1AB]>]>;
538 //===----------------------------------------------------------------------===//
539 // Map the target-defined scheduler read/write resources and latencies for Ampere-1B.
540 // This provides a coarse model, which is then specialised below.
542 def : WriteRes<WriteImm,   [Ampere1BUnitAB]>;  // MOVN, MOVZ
543 def : WriteRes<WriteI,     [Ampere1BUnitAB]>;  // ALU
544 def : WriteRes<WriteISReg, [Ampere1BUnitB, Ampere1BUnitAB]> {
545   let Latency = 2;
546   let NumMicroOps = 2;
547 }  // ALU of Shifted-Reg
548 def : WriteRes<WriteIEReg, [Ampere1BUnitAB, Ampere1BUnitAB]> {
549   let Latency = 2;
550   let NumMicroOps = 2;
551 }  // ALU of Extended-Reg
552 def : WriteRes<WriteExtr,  [Ampere1BUnitB]>;  // EXTR shifts a reg pair
553 def : WriteRes<WriteIS,    [Ampere1BUnitB]>;  // Shift/Scale
554 def : WriteRes<WriteID32,  [Ampere1BUnitBS, Ampere1BUnitX]> {
555   let Latency = 13;
556 }  // 32-bit Divide
557 def : WriteRes<WriteID64,  [Ampere1BUnitBS, Ampere1BUnitX]> {
558   let Latency = 19;
559 }  // 64-bit Divide
560 def : WriteRes<WriteIM32,  [Ampere1BUnitBS]> {
561   let Latency = 3;
562 }  // 32-bit Multiply
563 def : WriteRes<WriteIM64,  [Ampere1BUnitBS, Ampere1BUnitAB]> {
564   let Latency = 3;
565 }  // 64-bit Multiply
566 def : WriteRes<WriteBr,    [Ampere1BUnitA]>;
567 def : WriteRes<WriteBrReg, [Ampere1BUnitA, Ampere1BUnitA]>;
568 def : WriteRes<WriteLD,    [Ampere1BUnitL]> {
569   let Latency = 3;
570 }  // Load from base addr plus immediate offset
571 def : WriteRes<WriteST,    [Ampere1BUnitS]> {
572   let Latency = 1;
573 }  // Store to base addr plus immediate offset
574 def : WriteRes<WriteSTP,   [Ampere1BUnitS, Ampere1BUnitS]> {
575   let Latency = 1;
576   let NumMicroOps = 1;  // NOTE(review): two S entries but one uop (WriteSTIdx uses 2) -- confirm
577 }  // Store a register pair.
578 def : WriteRes<WriteAdr,   [Ampere1BUnitAB]>;
579 def : WriteRes<WriteLDIdx, [Ampere1BUnitAB, Ampere1BUnitS]> {  // NOTE(review): uses store-address unit S, not load unit L -- confirm
580   let Latency = 3;
581   let NumMicroOps = 1;
582 }  // Load from a register index (maybe scaled).
583 def : WriteRes<WriteSTIdx, [Ampere1BUnitS, Ampere1BUnitS]> {
584   let Latency = 1;
585   let NumMicroOps = 2;
586 }  // Store to a register index (maybe scaled).
587 def : WriteRes<WriteF,  [Ampere1BUnitXY]> {
588   let Latency = 2;
589 }  // General floating-point ops.
590 def : WriteRes<WriteFCmp,  [Ampere1BUnitX]> {
591   let Latency = 3;
592 }  // Floating-point compare.
593 def : WriteRes<WriteFCvt,  [Ampere1BUnitXY]> {
594   let Latency = 3;
595 }  // Float conversion.
596 def : WriteRes<WriteFCopy, [Ampere1BUnitXY]> {
597 }  // Float-int register copy.
598 def : WriteRes<WriteFImm,  [Ampere1BUnitXY]> {
599   let Latency = 2;
600 }  // Floating-point immediate.
601 def : WriteRes<WriteFMul,  [Ampere1BUnitXY]> {
602   let Latency = 4;
603 }  // Floating-point multiply.
604 def : WriteRes<WriteFDiv,  [Ampere1BUnitXY]> {
605   let Latency = 19;
606 }  // Floating-point division.
607 def : WriteRes<WriteVd,    [Ampere1BUnitXY]> {
608   let Latency = 3;
609 }  // 64bit Vector D ops.
610 def : WriteRes<WriteVq,    [Ampere1BUnitXY]> {
611   let Latency = 3;
612 }  // 128bit Vector Q ops.
613 def : WriteRes<WriteVLD,   [Ampere1BUnitL, Ampere1BUnitL]> {
614   let Latency = 4;
615 }  // Vector loads.
616 def : WriteRes<WriteVST,   [Ampere1BUnitS, Ampere1BUnitZ]> {
617   let Latency = 2;
618 }  // Vector stores.
620 def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
622 def : WriteRes<WriteSys,     []> { let Latency = 1; }
623 def : WriteRes<WriteBarrier, []> { let Latency = 1; }
624 def : WriteRes<WriteHint,    []> { let Latency = 1; }
626 def : WriteRes<WriteLDHi,    []> {
627   let Latency = 3;
628 }  // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
630 // Forwarding logic.
631 def : ReadAdvance<ReadI,       0>;
632 def : ReadAdvance<ReadISReg,   0>;
633 def : ReadAdvance<ReadIEReg,   0>;
634 def : ReadAdvance<ReadIM,      0>;
635 def : ReadAdvance<ReadIMA,     1, [WriteIM32, WriteIM64]>;
636 def : ReadAdvance<ReadID,      0>;
637 def : ReadAdvance<ReadExtrHi,  0>;
638 def : ReadAdvance<ReadST,      0>;
639 def : ReadAdvance<ReadAdrBase, 0>;
640 def : ReadAdvance<ReadVLD,     0>;
642 //===----------------------------------------------------------------------===//
643 // Specialising the scheduling model further for Ampere-1B.
645 def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>;
647 // Branch instructions
648 def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>;
649 def : InstRW<[Ampere1BWrite_1cyc_1A],
650         (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
651 def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>;
653 // Common Short Sequence Compression (CSSC)
654 def : InstRW<[Ampere1BWrite_1cyc_1AB], (instregex "^ABS[WX]")>;
655 def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CNT[WX]")>;
656 def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "^CTZ[WX]")>;
657 def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instregex "^[SU](MAX|MIN)[WX]")>;
659 // Cryptography instructions
660 // -- AES encryption/decryption
661 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>;
662 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>;
663 // -- Polynomial multiplication
664 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
665 // -- SHA-256 hash
666 def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>;
667 // -- SHA-256 schedule update
668 def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>;
669 // -- SHA-3 instructions
670 def : InstRW<[Ampere1BWrite_2cyc_1XY],
671         (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
672 // -- SHA-512 hash
673 def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>;
674 // -- SHA-512 schedule update
675 def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>;
676 // -- SHA1 choose/majority/parity
677 def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>;
678 // -- SHA1 hash/schedule update
679 def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>;
680 def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>;
681 // -- SM3 hash
682 def : InstRW<[Ampere1BWrite_2cyc_1XY],
683     (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>;
684 def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>;
686 // FP and vector load instructions
687 // -- Load 1-element structure to one/all lanes
688 // ---- all lanes
689 def : InstRW<[Ampere1BWrite_6cyc_1L_1XY],
690         (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
691 // ---- one lane
692 def : InstRW<[Ampere1BWrite_6cyc_1L_1XY],
693         (instregex "^LD1i(8|16|32|64)")>;
694 // -- Load 1-element structure to one/all lanes, 1D size
695 def : InstRW<[Ampere1BWrite_4cyc_1L],
696         (instregex "^LD1Rv1d")>;
697 // -- Load 1-element structures to 1 register
698 def : InstRW<[Ampere1BWrite_4cyc_1L],
699         (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
700 // -- Load 1-element structures to 2 registers
701 def : InstRW<[Ampere1BWrite_4cyc_2L],
702         (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
703 // -- Load 1-element structures to 3 registers
704 def : InstRW<[Ampere1BWrite_5cyc_3L],
705         (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
706 // -- Load 1-element structures to 4 registers
707 def : InstRW<[Ampere1BWrite_5cyc_4L],
708         (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
709 // -- Load 2-element structure to all lanes of 2 registers, 1D size
710 def : InstRW<[Ampere1BWrite_4cyc_2L],
711         (instregex "^LD2Rv1d")>;
712 // -- Load 2-element structure to all lanes of 2 registers, other sizes
713 def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
714         (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
715 // -- Load 2-element structure to one lane of 2 registers
716 def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
717         (instregex "^LD2i(8|16|32|64)")>;
718 // -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
719 def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
720         (instregex "^LD2Twov(16b|8h|4s|2d)")>;
721 // -- Load 2-element structures to 2 registers, 8B/4H/2S size
722 def : InstRW<[Ampere1BWrite_8cyc_2L_3XY],
723         (instregex "^LD2Twov(8b|4h|2s)")>;
724 // -- Load 3-element structure to all lanes of 3 registers, 1D size
725 def : InstRW<[Ampere1BWrite_5cyc_3L],
726         (instregex "^LD3Rv1d")>;
727 // -- Load 3-element structure to all lanes of 3 registers, other sizes
728 def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
729         (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
730 // -- Load 3-element structure to one lane of 3 registers
731 def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
732         (instregex "^LD3i(8|16|32|64)")>;
733 // -- Load 3-element structures to 3 registers, 16B/8H/4S sizes
734 def : InstRW<[Ampere1BWrite_8cyc_3L_3XY],
735         (instregex "^LD3Threev(16b|8h|4s)")>;
736 // -- Load 3-element structures to 3 registers, 2D size
737 def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
738         (instregex "^LD3Threev2d")>;
739 // -- Load 3-element structures to 3 registers, 8B/4H/2S sizes
740 def : InstRW<[Ampere1BWrite_9cyc_3L_3XY],
741         (instregex "^LD3Threev(8b|4h|2s)")>;
742 // -- Load 4-element structure to all lanes of 4 registers, 1D size
743 def : InstRW<[Ampere1BWrite_5cyc_4L],
744         (instregex "^LD4Rv1d")>;
745 // -- Load 4-element structure to all lanes of 4 registers, other sizes
746 def : InstRW<[Ampere1BWrite_7cyc_4L_4XY],
747         (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
748 // -- Load 4-element structure to one lane of 4 registers
749 def : InstRW<[Ampere1BWrite_7cyc_4L_4XY],
750         (instregex "^LD4i(8|16|32|64)")>;
751 // -- Load 4-element structures to 4 registers, 2D size
752 def : InstRW<[Ampere1BWrite_8cyc_4L_4XY],
753         (instregex "^LD4Fourv2d")>;
754 // -- Load 4-element structures to 4 registers, 2S size
755 def : InstRW<[Ampere1BWrite_11cyc_4L_8XY],
756         (instregex "^LD4Fourv2s")>;
757 // -- Load 4-element structures to 4 registers, other sizes
758 def : InstRW<[Ampere1BWrite_10cyc_4L_8XY],
759         (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
760 // -- Load pair, Q-form
761 def : InstRW<[Ampere1BWrite_4cyc_2L], (instregex "^LDN?PQ")>;
762 // -- Load pair, S/D-form
763 def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "^LDN?P(S|D)")>;
764 // -- Load register (NOTE(review): the store pattern also covers the "ui" forms; confirm LDR[BHSDQ]ui is handled elsewhere)
765 def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "^LDU?R[BHSDQ]i")>;
766 // -- Load register, sign-extended register offset
767 def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "^LDR[BHSDQ]ro(W|X)")>;
769 // FP and vector store instructions
770 // -- Store 1-element structure from one lane of 1 register
771 def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z],
772         (instregex "^ST1i(8|16|32|64)")>;
773 // -- Store 1-element structures from 1 register
774 def : InstRW<[Ampere1BWrite_2cyc_1S_1Z],
775         (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
776 // -- Store 1-element structures from 2 registers
777 def : InstRW<[Ampere1BWrite_3cyc_2S_2Z],
778         (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
779 // -- Store 1-element structures from 3 registers (8B/4H/2S/1D/16B/8H/4S/2D)
780 def : InstRW<[Ampere1BWrite_4cyc_3S_3Z],
781         (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
782 // -- Store 1-element structures from 4 registers (8B/4H/2S/1D/16B/8H/4S/2D)
783 def : InstRW<[Ampere1BWrite_5cyc_4S_4Z],
784         (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
785 // -- Store 2-element structure from one lane of 2 registers (8/16/32/64-bit lanes)
786 def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z],
787         (instregex "^ST2i(8|16|32|64)")>;
788 // -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes
789 def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z],
790         (instregex "^ST2Twov(16b|8h|4s|2d)")>;
791 // -- Store 2-element structures from 2 registers, 8B/4H/2S sizes
792 def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z],
793         (instregex "^ST2Twov(8b|4h|2s)")>;
794 // -- Store 3-element structure from one lane of 3 registers (8/16/32/64-bit lanes)
795 def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z],
796         (instregex "^ST3i(8|16|32|64)")>;
797 // -- Store 3-element structures from 3 registers (8B/4H/2S/1D/16B/8H/4S/2D)
798 def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z],
799         (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
800 // -- Store 4-element structure from one lane of 4 registers (8/16/32/64-bit lanes)
801 def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
802         (instregex "^ST4i(8|16|32|64)")>;
803 // -- Store 4-element structures from 4 registers, 16B/8H/4S sizes
804 def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
805         (instregex "^ST4Fourv(16b|8h|4s)")>;
806 // -- Store 4-element structures from 4 registers, 2D sizes (same write as 16B/8H/4S)
807 def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
808         (instregex "^ST4Fourv2d")>;
809 // -- Store 4-element structures from 4 registers, 8B/4H/2S sizes
810 def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z],
811         (instregex "^ST4Fourv(8b|4h|2s)")>;
812 // -- Store pair, Q-form (STP and non-temporal STNP)
813 def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>;
814 // -- Store pair, S/D-form (STP and non-temporal STNP)
815 def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>;
816 // -- Store register, B/H/S/D/Q forms, immediate offset (STR ...ui and STUR ...i)
817 def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>;
818 // -- Store register, B/H/S/D/Q forms, register offset (roX and roW index forms)
819 def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>;
821 // FP data processing, bfloat16 format (BFCVT*, BFDOT, BFMMLA, BFMLAL*)
822 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>;
823 def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
824 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
825 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>;
826 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>;
828 // FP data processing, scalar/vector, half precision (patterns select the ...16 / H-register names)
829 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
830 def : InstRW<[Ampere1BWrite_3cyc_1XY],
831         (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
832 def : InstRW<[Ampere1BWrite_3cyc_1XY],
833         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
834 def : InstRW<[Ampere1BWrite_3cyc_1XY],
835         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
836 def : InstRW<[Ampere1BWrite_3cyc_1X],
837         (instregex "^FCMPE?H")>;
838 def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X],
839         (instregex "^FCCMPE?H")>;
840 def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY],
841         (instregex "^FCSELH")>;
842 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; // Convert FP to integer, H-form
843 // Convert integer to FP ([SU]CVTF), SIMD H-form. NOTE(review): the 'd' in [SUd] looks like a stray character - confirm.
844 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi]16")>;
845 // Convert to FP from GPR, H-form. NOTE(review): the _ZPmZ_ names look like predicated vector forms, not GPR forms - confirm.
846 def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>;
847 // Convert to FP from GPR (W or X source), H-form; the pattern accepts both the S and U infix variants
848 def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>;
849 def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>;
850 def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>;
851 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
852 def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
853 def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
854 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
855 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>;
856 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
857 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
858 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>;
859 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
860 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
861 // FP square root, scalar H-form
862 def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>;
863 // FP square root, vector-form, F16
864 def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>;
866 // FP data processing, scalar/vector, single/double precision (patterns select the ...32/...64 and S/D names)
867 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
868 def : InstRW<[Ampere1BWrite_3cyc_1XY],
869         (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
870 def : InstRW<[Ampere1BWrite_3cyc_1XY],
871         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
872 def : InstRW<[Ampere1BWrite_3cyc_1XY],
873         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
874 def : InstRW<[Ampere1BWrite_3cyc_1X],
875         (instregex "^FCMPE?(S|D)")>;
876 def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X],
877         (instregex "^FCCMPE?(S|D)")>;
878 def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY],
879         (instregex "^FCSEL(S|D)")>;
880 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; // Convert FP to integer, S/D-form
881 // Convert integer to FP ([SU]CVTF), SIMD S/D-form. NOTE(review): the 'd' in [SUd] looks like a stray character - confirm.
882 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>;
883 // Convert to FP from GPR, S/D-form. NOTE(review): the _ZPmZ_ names look like predicated vector forms, not GPR forms - confirm.
884 def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>;
885 // Convert to FP from GPR (W or X source), S/D-form; the pattern accepts both the S and U infix variants
886 def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>;
887 def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>;
888 def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>;
889 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
890 def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
891 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
892 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>;
893 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULSrr, FNMULSrr)>;
894 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULDrr, FNMULDrr)>;
895 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
896 def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
897 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
898 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; // NOTE(review): the S16 counterpart uses a 4-cycle write - confirm
899 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
900 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>;
901 def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>; // FP square root, double precision
902 def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>; // FP square root, single precision
904 // FP miscellaneous instructions (FCVT to/from GPR and between precisions, FJCVTZS, FMOV variants)
905 def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
906 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
907 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
908 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>;
909 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>;
910 def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>;
911 def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; // NOTE(review): BS here vs Z for ^FMOV[WX][HSD]r below; DUP-from-GPR uses BS and FCVT-to-GPR uses Z - confirm the two directions are not swapped
912 def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; // NOTE(review): same write shape as INS-from-GPR - confirm direction
913 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
914 def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>; // NOTE(review): same write shape as (S|U)MOV element-to-GPR - confirm direction
915 def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;
917 // Integer arithmetic and logical instructions (add/sub/logical, carry, conditional, divide, multiply)
918 def : InstRW<[Ampere1BWrite_1cyc_1A],
919         (instregex "ADC(W|X)r", "SBC(W|X)r")>;
920 def : InstRW<[Ampere1BWrite_Arith],
921         (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>;
922 def : InstRW<[Ampere1BWrite_1cyc_1AB],
923         (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>;
924 def : InstRW<[Ampere1BWrite_ArithFlagsetting],
925         (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>;
926 def : InstRW<[Ampere1BWrite_1cyc_1A],
927         (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>;
928 def : InstRW<[Ampere1BWrite_1cyc_1A],
929         (instregex "(ADC|SBC)S[WX]r")>;
930 def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>;
931 def : InstRW<[Ampere1BWrite_1cyc_1A],
932         (instregex "(CCMN|CCMP)(X|W)")>;
933 def : InstRW<[Ampere1BWrite_1cyc_1A],
934         (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
935 def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>;
936 def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>;
937 def : InstRW<[Ampere1BWrite_3cyc_1BS],
938         (instregex "(S|U)MULHr")>;
939 def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB], // multiply-add/sub: long (S|U)M(ADD|SUB)L and plain M(ADD|SUB)[WX]; the old "L?r" tail could never match the plain W/X names
940         (instregex "(S|U)?M(ADD|SUB)(L|W|X)r")>;
942 // Integer load instructions (pair, immediate/literal/register-offset loads, prefetches)
943 def : InstRW<[Ampere1BWrite_3cyc_1L],
944         (instregex "(LDNP|LDP|LDPSW)(X|W)")>; // NOTE(review): LDPSW followed by X|W looks like it can never match an instruction name - confirm
945 def : InstRW<[Ampere1BWrite_3cyc_1L],
946         (instregex "LDR(B|D|H|Q|S)ui")>;
947 def : InstRW<[Ampere1BWrite_3cyc_1L],
948         (instregex "LDR(D|Q|W|X)l")>;
949 def : InstRW<[Ampere1BWrite_3cyc_1L],
950         (instregex "LDTR(B|H|W|X)i")>;
951 def : InstRW<[Ampere1BWrite_3cyc_1L],
952         (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
953 def : InstRW<[Ampere1BWrite_3cyc_1L],
954         (instregex "LDUR(BB|HH|X|W)i")>;
955 def : InstRW<[Ampere1BWrite_3cyc_1L],
956         (instregex "LDURS(BW|BX|HW|HX|W)i")>;
957 def : InstRW<[Ampere1BWrite_3cyc_1L],
958         (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
959 def : InstRW<[Ampere1BWrite_1cyc_1L], // prefetch: literal, unsigned-immediate and unscaled-immediate forms (PRFUMi was listed twice, PRFMui was missing)
960         (instrs PRFMl, PRFMui, PRFUMi)>;
961 def : InstRW<[Ampere1BWrite_1cyc_1L], // prefetch: register-offset forms
962         (instrs PRFMroW, PRFMroX)>;
964 // Integer miscellaneous instructions (address generation, bitfield/extract, CRC32, flag setting, move-wide, bit/byte reverse, variable shifts)
965 def : InstRW<[Ampere1BWrite_1cyc_1A],  (instrs ADR, ADRP)>;
966 def : InstRW<[Ampere1BWrite_1cyc_1B],  (instregex "EXTR(W|X)")>;
967 def : InstRW<[Ampere1BWrite_1cyc_1B],  (instregex "(S|U)?BFM(W|X)")>;
968 def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
969 def : InstRW<[Ampere1BWrite_1cyc_1B],  (instregex "CLS(W|X)")>;
970 def : InstRW<[Ampere1BWrite_1cyc_1A],  (instrs SETF8, SETF16)>;
971 def : InstRW<[Ampere1BWrite_1cyc_1AB],
972         (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
973 def : InstRW<[Ampere1BWrite_1cyc_1B],
974         (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
975 def : InstRW<[Ampere1BWrite_1cyc_1B],
976         (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
978 // Integer store instructions (pair, unprivileged, immediate-offset and register-offset stores)
979 def : InstRW<[Ampere1BWrite_1cyc_2S],        (instregex "STNP(X|W)i")>;
980 def : InstRW<[Ampere1BWrite_1cyc_2S],        (instrs STPXi)>;
981 def : InstRW<[Ampere1BWrite_2cyc_1B_1S],     (instrs STPWi)>;
982 def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>;
983 def : InstRW<[Ampere1BWrite_1cyc_1S],        (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
984 def : InstRW<[Ampere1BWrite_1cyc_1S],        (instregex "STUR(BB|HH|X|W)i", // duplicate STUR pattern dropped. NOTE(review): "STR(BB|HH)ui" may have been intended - confirm
985                                                         "STR(X|W)ui")>;
987 def : InstRW<[Ampere1BWrite_1cyc_2S],        (instrs STRWroX, STRXroX)>;
988 def : InstRW<[Ampere1BWrite_1cyc_2S],        (instrs STRWroW, STRXroW)>;
990 // Memory tagging instructions (tag insert/load/store and tagged-pointer arithmetic)
992 // Insert Random Tags (IRG, IRGstack)
993 def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>;
994 // Load allocation tag (LDG) and load tag multiple (LDGM)
995 def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>;
996 // Store allocation tags (immediate, pre/post-index, and STGM)
997 def : InstRW<[Ampere1BWrite_1cyc_1S],
998     (instrs STGi, STGM, STGPreIndex, STGPostIndex)>;
999 // Store allocation tags and pair of registers (immediate, pre/post-index)
1000 def : InstRW<[Ampere1BWrite_1cyc_2S],
1001     (instrs STGPi, STGPpre, STGPpost)>;
1002 // Store allocation tags and zero data (immediate, pre/post-index, and STZGM)
1003 def : InstRW<[Ampere1BWrite_1cyc_1S],
1004     (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>;
1005 // Store two tags (immediate, pre/post-index)
1006 def : InstRW<[Ampere1BWrite_1cyc_2S],
1007     (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>;
1008 // Store two tags and zero data (immediate, pre/post-index)
1009 def : InstRW<[Ampere1BWrite_1cyc_2S],
1010     (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>;
1011 // Subtract Pointer
1012 def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>;
1013 // Subtract Pointer, flagset (same write as SUBP)
1014 def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>;
1015 // Insert Tag Mask
1016 def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>;
1017 // Arithmetic, immediate to logical address tag (ADDG, SUBG)
1018 def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs ADDG, SUBG)>;
1020 // Pointer authentication (AUT*/PAC* prefixes, authenticated branches/returns/loads, XPAC)
1021 def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>;
1022 def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], // BRA*, RETA*, ERETA*
1023         (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
1024 def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], // BLRA* (uses two A units where BRA* uses one)
1025         (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
1026 def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>;
1027 def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
1028 def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>;
1030 // Vector integer instructions
1031 // -- absolute difference (and accumulate / long variants)
1032 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1033              (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
1034                         "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
1035 // -- arithmetic (abs, add/sub, long/wide add/sub, halving and rounding-halving add/sub)
1036 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1037         (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
1038                    "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
1039                    "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
1040 // -- arithmetic, horizontal, 16B
1041 def : InstRW<[Ampere1BWrite_8cyc_4XY],
1042             (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
1043 def : InstRW<[Ampere1BWrite_8cyc_4XY],
1044             (instregex "^[SU](MIN|MAX)Vv16i8v")>;
1045 // -- arithmetic, horizontal: add on 8B/4H/2S, min/max on 4H/4S. NOTE(review): add and min/max group 8B and 4S differently here and below - confirm
1046 def : InstRW<[Ampere1BWrite_4cyc_2XY],
1047             (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>;
1048 def : InstRW<[Ampere1BWrite_4cyc_2XY],
1049             (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
1050 // -- arithmetic, horizontal: add on 8H/4S, min/max on 8B/8H
1051 def : InstRW<[Ampere1BWrite_6cyc_3XY],
1052             (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>;
1053 def : InstRW<[Ampere1BWrite_6cyc_3XY],
1054             (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
1055 // -- arithmetic, narrowing (add/sub high narrow, plain and rounding)
1056 def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
1057 def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
1058 // -- arithmetic, pairwise (add pairwise, add long pairwise, with and without accumulate)
1059 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1060         (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
1061 // -- arithmetic, saturating
1062 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1063         (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>;
1064 // -- bit count (CLS, CLZ, CNT)
1065 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1066         (instregex "^(CLS|CLZ|CNT)v")>;
1067 // -- compare
1068 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1069         (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv",
1070                    "^CMHIv", "^CMHSv")>;
1071 // -- compare non-zero (CMTST)
1072 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>;
1073 // -- dot product
1074 def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>;
1075 // -- fp reciprocal estimate (and reciprocal square-root estimate)
1076 def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>;
1077 // -- integer reciprocal estimate (and reciprocal square-root estimate)
1078 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>;
1079 // -- logical
1080 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1081         (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
1082 // -- shift right narrow (plain, rounding, saturating) and UQXTN extract narrow
1083 def : InstRW<[Ampere1BWrite_6cyc_2XY],
1084         (instregex "RSHRNv",
1085                    "SHRNv", "SQSHRNv", "SQSHRUNv",
1086                    "UQXTNv")>;
1087 // -- matrix multiply
1088 def : InstRW<[Ampere1BWrite_3cyc_1XY],
1089         (instrs SMMLA, UMMLA, USMMLA)>;
1090 // -- max/min (element-wise, then pairwise)
1091 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1092         (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>;
1093 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1094         (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>;
1095 // -- move immediate (MOVI, MVNI)
1096 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>;
1097 // -- multiply (incl. long and saturating doubling forms)
1098 def : InstRW<[Ampere1BWrite_3cyc_1XY],
1099         (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>;
1100 // -- multiply accumulate (incl. long and saturating doubling forms)
1101 def : InstRW<[Ampere1BWrite_3cyc_1XY],
1102         (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>;
1103 // -- negation, saturating (and saturating absolute value)
1104 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>;
1105 // -- reverse bits/bytes
1106 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1107         (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>;
1108 // -- shift ([SU]SHL; only the 16B/8H/4S/2D arrangements are listed)
1109 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
1110 // -- shift and accumulate
1111 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1112         (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;
1113 // -- shift, saturating; also covers the saturating narrow forms SQXTN/SQXTUN and [SU]Q(R)SHRN listed here
1114 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1115         (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU",
1116                    "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL",
1117                    "^UQSHL")>;
1119 // Vector miscellaneous instructions
1120 // -- duplicate element (DUP from lane)
1121 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>;
1122 // -- duplicate from GPR
1123 def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>;
1124 // -- extract narrow (XTN)
1125 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>;
1126 // -- extract vector (EXT) and insert element from lane (INS ... lane)
1127 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>;
1128 // -- move FP immediate
1129 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>;
1130 // -- move element to GPR (SMOV, UMOV)
1131 def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>;
1132 // -- move from GPR to any element (INS ... gpr)
1133 def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>;
1134 // -- table lookup (TBL/TBX); the write grows with the number of table registers, One through Four
1135 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1136             (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
1137 def : InstRW<[Ampere1BWrite_4cyc_2XY],
1138             (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
1139 def : InstRW<[Ampere1BWrite_6cyc_3XY],
1140             (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
1141 def : InstRW<[Ampere1BWrite_8cyc_4XY],
1142             (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
1143 // -- transpose (TRN1/TRN2) and unzip (UZP1/UZP2)
1144 def : InstRW<[Ampere1BWrite_2cyc_1XY],
1145               (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
1146 // -- zip (ZIP1/ZIP2)
1147 def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
1149 } // SchedModel = Ampere1BModel