[mlir] More fixes for 9fddaf6b14102963f12dbb9730f101fc52e662c1
[llvm-project.git] / llvm / lib / Target / AArch64 / AArch64SchedAmpere1.td
blob269f4ec5e5fb16266eed65fc4cbf8a337ce74244
1 //=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the machine model for the Ampere Computing Ampere-1 to
10 // support instruction scheduling and other instruction cost heuristics.
12 //===----------------------------------------------------------------------===//
14 // The Ampere-1 core is an out-of-order micro-architecture.  The front
15 // end has branch prediction, with a 10-cycle recovery time from a
16 // mispredicted branch.  Instructions coming out of the front end are
17 // decoded into internal micro-ops (uops).
19 def Ampere1Model : SchedMachineModel {
20   let IssueWidth            =   4;  // 4-way decode and dispatch
21   let MicroOpBufferSize     = 192;  // re-order buffer size
22   let LoadLatency           =   4;  // Optimistic load latency
23   let MispredictPenalty     =  10;  // Branch mispredict penalty
24   let LoopMicroOpBufferSize =  32;  // Instruction queue size
25   let CompleteModel = 0;  // This model is not declared complete
27   list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
28                                                     SMEUnsupported.F,
29                                                     PAUnsupported.F,
30                                                     [HasMTE]);  // SVE, SME, PA lists plus MTE
33 let SchedModel = Ampere1Model in {
35 //===----------------------------------------------------------------------===//
36 // Define each kind of processor resource and number available on Ampere-1.
37 // Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP,
38 // and 2 memory) issue into.  The integer and FP schedulers can each issue
39 // one uop per cycle, while the memory schedulers can each issue one load
40 // and one store address calculation per cycle.
42 def Ampere1UnitA  : ProcResource<2>;  // integer single-cycle, branch, and flags r/w
43 def Ampere1UnitB  : ProcResource<2>;  // integer single-cycle, and complex shifts
44 def Ampere1UnitBS : ProcResource<1>;  // integer multi-cycle
45 def Ampere1UnitL  : ProcResource<2>;  // load
46 def Ampere1UnitS  : ProcResource<2>;  // store address calculation
47 def Ampere1UnitX  : ProcResource<1>;  // FP and vector operations, and flag write
48 def Ampere1UnitY  : ProcResource<1>;  // FP and vector operations, and crypto
49 def Ampere1UnitZ  : ProcResource<1>;  // FP store data and FP-to-integer moves
51 def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>;
52 def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>;
54 //===----------------------------------------------------------------------===//
55 // Define customized scheduler read/write types specific to the Ampere-1.
57 def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> {
58   let Latency = 1;
59   let NumMicroOps = 1;
62 def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> {
63   let Latency = 1;
64   let NumMicroOps = 2;
67 def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> {
68   let Latency = 1;
69   let NumMicroOps = 1;
72 def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> {
73   let Latency = 1;
74   let NumMicroOps = 1;
77 def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
78   let Latency = 1;
79   let NumMicroOps = 1;
82 def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> {
83   let Latency = 1;
84   let NumMicroOps = 1;
87 def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> {
88   let Latency = 1;
89   let NumMicroOps = 2;
92 def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
93   let Latency = 2;
94   let NumMicroOps = 1;
97 def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> {
98   let Latency = 2;
99   let NumMicroOps = 2;
102 def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> {
103   let Latency = 2;
104   let NumMicroOps = 2;
107 def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> {
108   let Latency = 2;
109   let NumMicroOps = 2;
112 def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> {
113   let Latency = 2;
114   let NumMicroOps = 2;
117 def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
118   let Latency = 2;
119   let NumMicroOps = 2;
122 def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
123                                                              Ampere1UnitS]> {
124   let Latency = 2;
125   let NumMicroOps = 3;
128 def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
129                                                                 Ampere1UnitZ]> {
130   let Latency = 2;
131   let NumMicroOps = 3;
134 def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> {
135   let Latency = 2;
136   let NumMicroOps = 2;
139 def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
140   let Latency = 2;
141   let NumMicroOps = 1;
144 def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> {
145   let Latency = 2;
146   let NumMicroOps = 2;
149 def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
150   let Latency = 3;
151   let NumMicroOps = 1;
154 def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
155   let Latency = 3;
156   let NumMicroOps = 1;
159 def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS,
160                                                                Ampere1UnitAB]> {
161   let Latency = 3;  // 3 cycles, as encoded in the def name (was 2)
162   let NumMicroOps = 3;  // one uop per listed unit: B, S, AB
165 def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> {
166   let Latency = 3;  // 3 cycles, as encoded in the def name (was 2)
167   let NumMicroOps = 3;  // one uop per listed unit: S, Z, Z
170 def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
171                                              Ampere1UnitZ, Ampere1UnitZ]> {
172   let Latency = 3;  // 3 cycles, as encoded in the def name (was 2)
173   let NumMicroOps = 4;  // one uop per listed unit: S, S, Z, Z
176 def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
177   let Latency = 4;
178   let NumMicroOps = 1;
181 def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
182   let Latency = 4;
183   let NumMicroOps = 1;
186 def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
187   let Latency = 4;
188   let NumMicroOps = 1;
191 def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
192   let Latency = 4;
193   let NumMicroOps = 1;
196 def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> {
197   let Latency = 4;
198   let NumMicroOps = 1;
201 def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
202   let Latency = 4;
203   let NumMicroOps = 2;
206 def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
207   let Latency = 4;
208   let NumMicroOps = 1;
211 def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
212   let Latency = 4;
213   let NumMicroOps = 2;
216 def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> {
217   let Latency = 4;
218   let NumMicroOps = 3;
221 def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
222                                              Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
223   let Latency = 4;
224   let NumMicroOps = 6;
227 def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
228   let Latency = 5;
229   let NumMicroOps = 2;
232 def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
233   let Latency = 5;
234   let NumMicroOps = 1;
237 def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
238   let Latency = 5;
239   let NumMicroOps = 1;
242 def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
243   let Latency = 5;
244   let NumMicroOps = 1;
247 def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
248   let Latency = 5;
249   let NumMicroOps = 2;
252 def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> {
253   let Latency = 5;
254   let NumMicroOps = 2;
257 def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
258   let Latency = 5;
259   let NumMicroOps = 1;
262 def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
263   let Latency = 5;
264   let NumMicroOps = 2;
267 def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
268                                              Ampere1UnitS, Ampere1UnitS,
269                                              Ampere1UnitZ, Ampere1UnitZ,
270                                              Ampere1UnitZ, Ampere1UnitZ]> {
271   let Latency = 5;
272   let NumMicroOps = 8;
275 def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
276                                                  Ampere1UnitS, Ampere1UnitS,
277                                                  Ampere1UnitZ, Ampere1UnitZ]> {
278   let Latency = 5;
279   let NumMicroOps = 6;
282 def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
283                                                  Ampere1UnitS, Ampere1UnitS,
284                                                  Ampere1UnitZ, Ampere1UnitZ]> {
285   let Latency = 6;
286   let NumMicroOps = 6;
289 def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY,
290                                                  Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
291                                                  Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
292   let Latency = 6;
293   let NumMicroOps = 9;
296 def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
297   let Latency = 6;
298   let NumMicroOps = 2;
301 def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
302   let Latency = 6;
303   let NumMicroOps = 1;
306 def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
307   let Latency = 6;
308   let NumMicroOps = 2;
311 def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
312   let Latency = 6;
313   let NumMicroOps = 3;
316 def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> {
317   let Latency = 6;
318   let NumMicroOps = 3;
321 def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
322                                           Ampere1UnitL, Ampere1UnitL]> {
323   let Latency = 6;
324   let NumMicroOps = 4;
327 def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
328   let Latency = 6;
329   let NumMicroOps = 2;
332 def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
333   let Latency = 7;
334   let NumMicroOps = 1;
337 def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> {
338   let Latency = 7;
339   let NumMicroOps = 2;
342 def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> {
343   let Latency = 7;
344   let NumMicroOps = 2;
347 def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
348                                               Ampere1UnitXY, Ampere1UnitXY]> {
349   let Latency = 7;
350   let NumMicroOps = 4;
353 def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
354   let Latency = 7;
355   let NumMicroOps = 2;
358 def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
359                                                  Ampere1UnitXY, Ampere1UnitXY,
360                                                  Ampere1UnitS, Ampere1UnitS,
361                                                  Ampere1UnitS, Ampere1UnitS,
362                                                  Ampere1UnitZ, Ampere1UnitZ,
363                                                  Ampere1UnitZ, Ampere1UnitZ]> {
364   let Latency = 7;
365   let NumMicroOps = 12;
368 def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> {
369   let Latency = 8;
370   let NumMicroOps = 2;
373 def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA,
374                                                              Ampere1UnitA]> {
375   let Latency = 8;
376   let NumMicroOps = 3;
379 def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
380   let Latency = 8;
381   let NumMicroOps = 2;
384 def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
385                                            Ampere1UnitXY, Ampere1UnitXY]> {
386   let Latency = 8;
387   let NumMicroOps = 4;
390 def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
391                                               Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
392   let Latency = 8;
393   let NumMicroOps = 6;
396 def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
397                                               Ampere1UnitL, Ampere1UnitL,
398                                               Ampere1UnitXY, Ampere1UnitXY,
399                                               Ampere1UnitXY, Ampere1UnitXY]> {
400   let Latency = 8;
401   let NumMicroOps = 8;
404 def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
405                                               Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
406   let Latency = 9;
407   let NumMicroOps = 6;
410 def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
411                                               Ampere1UnitL, Ampere1UnitL,
412                                               Ampere1UnitXY, Ampere1UnitXY,
413                                               Ampere1UnitXY, Ampere1UnitXY]> {
414   let Latency = 9;
415   let NumMicroOps = 8;
418 def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
419   let Latency = 9;
420   let NumMicroOps = 3;
423 def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
424                                               Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
425   let Latency = 9;
426   let NumMicroOps = 5;
429 def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
430                                                  Ampere1UnitXY, Ampere1UnitXY,
431                                                  Ampere1UnitXY, Ampere1UnitXY,
432                                                  Ampere1UnitS, Ampere1UnitS,
433                                                  Ampere1UnitS, Ampere1UnitS,
434                                                  Ampere1UnitZ, Ampere1UnitZ,
435                                                  Ampere1UnitZ, Ampere1UnitZ]> {
436   let Latency = 9;
437   let NumMicroOps = 14;
440 def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
441                                                  Ampere1UnitXY, Ampere1UnitXY,
442                                                  Ampere1UnitXY, Ampere1UnitXY,
443                                                  Ampere1UnitXY, Ampere1UnitXY,
444                                                  Ampere1UnitS, Ampere1UnitS,
445                                                  Ampere1UnitS, Ampere1UnitS,
446                                                  Ampere1UnitZ, Ampere1UnitZ,
447                                                  Ampere1UnitZ, Ampere1UnitZ]> {
448   let Latency = 9;
449   let NumMicroOps = 16;
452 def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
453   let Latency = 10;
454   let NumMicroOps = 2;
457 def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
458   let Latency = 10;
459   let NumMicroOps = 2;
462 def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> {
463   let Latency = 10;
464   let NumMicroOps = 2;
467 def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
468                                                Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
469   let Latency = 10;
470   let NumMicroOps = 6;
473 def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
474   let Latency = 10;
475   let NumMicroOps = 3;
478 def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
479   let Latency = 10;
480   let NumMicroOps = 3;
483 def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> {
484   let Latency = 11;
485   let NumMicroOps = 2;
488 def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
489   let Latency = 11;
490   let NumMicroOps = 3;
493 def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
494   let Latency = 11;
495   let NumMicroOps = 3;
498 def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
499                                                Ampere1UnitL, Ampere1UnitL,
500                                                Ampere1UnitXY, Ampere1UnitXY,
501                                                Ampere1UnitXY, Ampere1UnitXY,
502                                                Ampere1UnitXY, Ampere1UnitXY,
503                                                Ampere1UnitXY, Ampere1UnitXY]> {
504   let Latency = 11;
505   let NumMicroOps = 12;
508 def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
509                                                Ampere1UnitL, Ampere1UnitL,
510                                                Ampere1UnitXY, Ampere1UnitXY,
511                                                Ampere1UnitXY, Ampere1UnitXY,
512                                                Ampere1UnitXY, Ampere1UnitXY,
513                                                Ampere1UnitXY, Ampere1UnitXY]> {
514   let Latency = 12;
515   let NumMicroOps = 12;
518 def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
519   let Latency = 12;
520   let NumMicroOps = 3;
523 def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
524                                             Ampere1UnitXY, Ampere1UnitXY]> {
525   let Latency = 12;
526   let NumMicroOps = 4;
529 def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
530   let Latency = 18;
531   let NumMicroOps = 1;
534 def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
535   let Latency = 19;
536   let NumMicroOps = 1;
539 def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
540   let Latency = 25;
541   let NumMicroOps = 1;
544 def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
545   let Latency = 32;
546   let NumMicroOps = 1;
549 def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
550   let Latency = 34;
551   let NumMicroOps = 1;
554 def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
555   let Latency = 34;
556   let NumMicroOps = 1;
559 def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
560   let Latency = 39;
561   let NumMicroOps = 1;
564 def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
565   let Latency = 62;
566   let NumMicroOps = 1;
569 // For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
570 // which are a single uop, and for extended registers, which have full flexibility
571 // across Unit A or B for both uops.
572 def Ampere1Write_Arith : SchedWriteVariant<[
573                                 SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>,
574                                 SchedVar<IsCheapLSL,      [Ampere1Write_1cyc_1AB]>,
575                                 SchedVar<NoSchedPred,     [Ampere1Write_2cyc_1B_1AB]>]>;
577 def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[
578                                 SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>,
579                                 SchedVar<IsCheapLSL,      [Ampere1Write_1cyc_1A]>,
580                                 SchedVar<NoSchedPred,     [Ampere1Write_2cyc_1B_1A]>]>;
582 //===----------------------------------------------------------------------===//
583 // Map the target-defined scheduler read/write resources and latencies for Ampere-1.
584 // This provides a coarse model, which is then specialised below.
586 def : WriteRes<WriteImm,   [Ampere1UnitAB]>;  // MOVN, MOVZ
587 def : WriteRes<WriteI,     [Ampere1UnitAB]>;  // ALU
588 def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]> {
589   let Latency = 2;
590   let NumMicroOps = 2;
591 }  // ALU of Shifted-Reg
592 def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]> {
593   let Latency = 2;
594   let NumMicroOps = 2;
595 }  // ALU of Extended-Reg
596 def : WriteRes<WriteExtr,  [Ampere1UnitB]>;  // EXTR shifts a reg pair
597 def : WriteRes<WriteIS,    [Ampere1UnitB]>;  // Shift/Scale
598 def : WriteRes<WriteID32,  [Ampere1UnitBS]> {
599   let Latency = 18;
600 }  // 32-bit Divide
601 def : WriteRes<WriteID64,  [Ampere1UnitBS]> {
602   let Latency = 34;
603 }  // 64-bit Divide
604 def : WriteRes<WriteIM32,  [Ampere1UnitBS]> {
605   let Latency = 3;
606 }  // 32-bit Multiply
607 def : WriteRes<WriteIM64,  [Ampere1UnitBS]> {
608   let Latency = 3;
609 }  // 64-bit Multiply
610 def : WriteRes<WriteBr,    [Ampere1UnitA]>;
611 def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
612 def : WriteRes<WriteLD,    [Ampere1UnitL]> {
613   let Latency = 4;
614 }  // Load from base addr plus immediate offset
615 def : WriteRes<WriteST,    [Ampere1UnitS]> {
616   let Latency = 1;
617 }  // Store to base addr plus immediate offset
618 def : WriteRes<WriteSTP,   [Ampere1UnitS, Ampere1UnitS]> {
619   let Latency = 1;
620   let NumMicroOps = 2;
621 }  // Store a register pair.
622 def : WriteRes<WriteAdr,   [Ampere1UnitAB]>;
623 def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> {
624   let Latency = 5;
625   let NumMicroOps = 2;
626 }  // Load from a register index (maybe scaled).  NOTE(review): uses the store unit (Ampere1UnitS), while Ampere1Write_5cyc_1AB_1L uses Ampere1UnitL -- confirm intended.
627 def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
628   let Latency = 1;
629   let NumMicroOps = 2;
630 }  // Store to a register index (maybe scaled).
631 def : WriteRes<WriteF,  [Ampere1UnitXY]> {
632   let Latency = 2;
633 }  // General floating-point ops.
634 def : WriteRes<WriteFCmp,  [Ampere1UnitX]> {
635   let Latency = 5;
636 }  // Floating-point compare.
637 def : WriteRes<WriteFCvt,  [Ampere1UnitXY]> {
638   let Latency = 6;
639 }  // Float conversion.
640 def : WriteRes<WriteFCopy, [Ampere1UnitXY]> {
641 }  // Float-int register copy.
642 def : WriteRes<WriteFImm,  [Ampere1UnitXY]> {
643   let Latency = 2;
644 }  // Floating-point immediate.
645 def : WriteRes<WriteFMul,  [Ampere1UnitXY]> {
646   let Latency = 5;
647 }  // Floating-point multiply.
648 def : WriteRes<WriteFDiv,  [Ampere1UnitXY]> {
649   let Latency = 34;
650 }  // Floating-point division.
651 def : WriteRes<WriteVd,    [Ampere1UnitXY]> {
652   let Latency = 3;
653 }  // 64bit Vector D ops.
654 def : WriteRes<WriteVq,    [Ampere1UnitXY]> {
655   let Latency = 3;
656 }  // 128bit Vector Q ops.
657 def : WriteRes<WriteVLD,   [Ampere1UnitL, Ampere1UnitL]> {
658   let Latency = 5;
659 }  // Vector loads.
660 def : WriteRes<WriteVST,   [Ampere1UnitS, Ampere1UnitZ]> {
661   let Latency = 2;
662 }  // Vector stores.
664 def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
666 def : WriteRes<WriteSys,     []> { let Latency = 1; }
667 def : WriteRes<WriteBarrier, []> { let Latency = 1; }
668 def : WriteRes<WriteHint,    []> { let Latency = 1; }
670 def : WriteRes<WriteLDHi,    []> {
671   let Latency = 4;
672 }  // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
674 // Forwarding logic.
675 def : ReadAdvance<ReadI,       0>;
676 def : ReadAdvance<ReadISReg,   0>;
677 def : ReadAdvance<ReadIEReg,   0>;
678 def : ReadAdvance<ReadIM,      0>;
679 def : ReadAdvance<ReadIMA,     1, [WriteIM32, WriteIM64]>;
680 def : ReadAdvance<ReadID,      0>;
681 def : ReadAdvance<ReadExtrHi,  0>;
682 def : ReadAdvance<ReadST,      0>;
683 def : ReadAdvance<ReadAdrBase, 0>;
684 def : ReadAdvance<ReadVLD,     0>;
686 //===----------------------------------------------------------------------===//
687 // Specialising the scheduling model further for Ampere-1.
689 def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>;
691 // Branch instructions
692 def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>;
693 def : InstRW<[Ampere1Write_1cyc_1A],
694         (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
695 def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>;
697 // Cryptography instructions
698 // -- AES encryption/decryption
699 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>;
700 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>;
701 // -- Polynomial multiplication
702 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
703 // -- SHA-256 hash
704 def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>;
705 // -- SHA-256 schedule update
706 def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>;
707 // -- SHA-3 instructions
708 def : InstRW<[Ampere1Write_2cyc_1XY],
709         (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
710 // -- SHA-512 hash
711 def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>;
712 // -- SHA-512 schedule update
713 def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>;
714 // -- SHA1 choose/majority/parity
715 def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>;
716 // -- SHA1 hash/schedule update
717 def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>;
718 def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>;
720 // FP and vector load instructions
721 // -- Load 1-element structure to one/all lanes
722 // ---- all lanes
723 def : InstRW<[Ampere1Write_7cyc_1L_1XY],
724         (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
725 // ---- one lane
726 def : InstRW<[Ampere1Write_7cyc_1L_1XY],
727         (instregex "^LD1i(8|16|32|64)")>;
728 // -- Load 1-element structure to one/all lanes, 1D size
729 def : InstRW<[Ampere1Write_5cyc_1L],
730         (instregex "^LD1Rv1d")>;
731 // -- Load 1-element structures to 1 register
732 def : InstRW<[Ampere1Write_5cyc_1L],
733         (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
734 // -- Load 1-element structures to 2 registers
735 def : InstRW<[Ampere1Write_5cyc_2L],
736         (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
737 // -- Load 1-element structures to 3 registers
738 def : InstRW<[Ampere1Write_6cyc_3L],
739         (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
740 // -- Load 1-element structures to 4 registers
741 def : InstRW<[Ampere1Write_6cyc_4L],
742         (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
743 // -- Load 2-element structure to all lanes of 2 registers, 1D size
744 def : InstRW<[Ampere1Write_5cyc_2L],
745         (instregex "^LD2Rv1d")>;
746 // -- Load 2-element structure to all lanes of 2 registers, other sizes
747 def : InstRW<[Ampere1Write_7cyc_2L_2XY],
748         (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
749 // -- Load 2-element structure to one lane of 2 registers
750 def : InstRW<[Ampere1Write_7cyc_2L_2XY],
751         (instregex "^LD2i(8|16|32|64)")>;
752 // -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
753 def : InstRW<[Ampere1Write_7cyc_2L_2XY],
754         (instregex "^LD2Twov(16b|8h|4s|2d)")>;
755 // -- Load 2-element structures to 2 registers, 8B/4H/2S size
756 def : InstRW<[Ampere1Write_9cyc_2L_3XY],
757         (instregex "^LD2Twov(8b|4h|2s)")>;
758 // -- Load 3-element structure to all lanes of 3 registers, 1D size
759 def : InstRW<[Ampere1Write_6cyc_3L],
760         (instregex "^LD3Rv1d")>;
761 // -- Load 3-element structure to all lanes of 3 registers, other sizes
762 def : InstRW<[Ampere1Write_8cyc_3L_3XY],
763         (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
764 // -- Load 3-element structure to one lane of 3 registers
765 def : InstRW<[Ampere1Write_8cyc_3L_3XY],
766         (instregex "^LD3i(8|16|32|64)")>;
767 // -- Load 3-element structures to 3 registers, 16B/8H/4S sizes
768 def : InstRW<[Ampere1Write_9cyc_3L_3XY],
769         (instregex "^LD3Threev(16b|8h|4s)")>;
770 // -- Load 3-element structures to 3 registers, 2D size
771 def : InstRW<[Ampere1Write_8cyc_3L_3XY],
772         (instregex "^LD3Threev2d")>;
773 // -- Load 3-element structures to 3 registers, 8B/4H/2S sizes
774 def : InstRW<[Ampere1Write_10cyc_3L_3XY],
775         (instregex "^LD3Threev(8b|4h|2s)")>;
776 // -- Load 4-element structure to all lanes of 4 registers, 1D size
777 def : InstRW<[Ampere1Write_6cyc_4L],
778         (instregex "^LD4Rv1d")>;
779 // -- Load 4-element structure to all lanes of 4 registers, other sizes
780 def : InstRW<[Ampere1Write_8cyc_4L_4XY],
781         (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
782 // -- Load 4-element structure to one lane of 4 registers
783 def : InstRW<[Ampere1Write_6cyc_4L],
784         (instregex "^LD4i(8|16|32|64)")>;
785 // -- Load 4-element structures to 4 registers, 2D size
786 def : InstRW<[Ampere1Write_9cyc_4L_4XY],
787         (instregex "^LD4Fourv2d")>;
788 // -- Load 4-element structures to 4 registers, 2S size
789 def : InstRW<[Ampere1Write_12cyc_4L_8XY],
790         (instregex "^LD4Fourv2s")>;
791 // -- Load 4-element structures to 4 registers, other sizes
792 def : InstRW<[Ampere1Write_11cyc_4L_8XY],
793         (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
794 // -- Load pair, Q-form
795 def : InstRW<[Ampere1Write_5cyc_2L], (instregex "^LDN?PQ")>;
796 // -- Load pair, S/D-form  (NOTE(review): as a prefix match this also covers names starting with LDPSW -- confirm intended)
797 def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "^LDN?P(S|D)")>;
798 // -- Load register  (NOTE(review): the store rule uses "(ui|i)"; this pattern accepts only the "i" suffix -- confirm)
799 def : InstRW<[Ampere1Write_5cyc_1L], (instregex "^LDU?R[BHSDQ]i")>;
800 // -- Load register, sign-extended register offset
801 def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "^LDR[BHSDQ]ro(W|X)")>;
803 // FP and vector store instructions (ST1-ST4 structure stores, STP/STNP pairs, STR/STUR register stores)
804 // -- Store 1-element structure from one lane of 1 register
805 def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z],
806         (instregex "^ST1i(8|16|32|64)")>;
807 // -- Store 1-element structures from 1 register
808 def : InstRW<[Ampere1Write_2cyc_1S_1Z],
809         (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
810 // -- Store 1-element structures from 2 registers
811 def : InstRW<[Ampere1Write_3cyc_2S_2Z],
812         (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
813 // -- Store 1-element structures from 3 registers
814 def : InstRW<[Ampere1Write_4cyc_3S_3Z],
815         (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
816 // -- Store 1-element structures from 4 registers
817 def : InstRW<[Ampere1Write_5cyc_4S_4Z],
818         (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
819 // -- Store 2-element structure from one lane of 2 registers
820 def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
821         (instregex "^ST2i(8|16|32|64)")>;
822 // -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes
823 def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
824         (instregex "^ST2Twov(16b|8h|4s|2d)")>;
825 // -- Store 2-element structures from 2 registers, 8B/4H/2S sizes
826 def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z],
827         (instregex "^ST2Twov(8b|4h|2s)")>;
828 // -- Store 3-element structure from one lane of 3 registers
829 def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
830         (instregex "^ST3i(8|16|32|64)")>;
831 // -- Store 3-element structures from 3 registers
832 def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
833         (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
834 // -- Store 4-element structure from one lane of 4 registers
835 def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
836         (instregex "^ST4i(8|16|32|64)")>;
837 // -- Store 4-element structures from 4 registers, 16B/8H/4S sizes
838 def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z],
839         (instregex "^ST4Fourv(16b|8h|4s)")>;
840 // -- Store 4-element structures from 4 registers, 2D sizes
841 def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
842         (instregex "^ST4Fourv2d")>;
843 // -- Store 4-element structures from 4 registers, 8B/4H/2S sizes
844 def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z],
845         (instregex "^ST4Fourv(8b|4h|2s)")>;
846 // -- Store pair, Q-form
847 def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>;
848 // -- Store pair, S/D-form
849 def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>;
850 // -- Store register
851 def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>;
852 // -- Store register, sign-extended register offset
853 def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>;
855 // FP data processing, bfloat16 format (BFCVT/BFCVTN, BFDOT, BFMMLA, BFMLAL)
856 def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>;
857 def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
858 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
859 def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>;
860 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>;
862 // FP data processing, scalar/vector, half precision (patterns keyed on 16 / H in the record name)
863 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
864 def : InstRW<[Ampere1Write_4cyc_1XY],
865         (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
866 def : InstRW<[Ampere1Write_4cyc_1XY],
867         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
868 def : InstRW<[Ampere1Write_4cyc_1XY],
869         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
870 def : InstRW<[Ampere1Write_4cyc_1X],
871         (instregex "^FCMPE?H")>;
872 def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X],
873         (instregex "^FCCMPE?H")>;
874 def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY],
875         (instregex "^FCSELH")>;
876 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>;
877 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>;
878 def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>;
879 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
880 def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
881 def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
882 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
883 def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>;
884 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
885 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
886 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>;
887 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
888 def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
889 def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>;
891 // FP data processing, scalar/vector, single/double precision (patterns keyed on 32/64 or S/D in the record name)
892 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
893 def : InstRW<[Ampere1Write_5cyc_1XY],
894         (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
895 def : InstRW<[Ampere1Write_5cyc_1XY],
896         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
897 def : InstRW<[Ampere1Write_5cyc_1XY],
898         (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
899 def : InstRW<[Ampere1Write_5cyc_1X],
900         (instregex "^FCMPE?(S|D)")>;
901 def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X],
902         (instregex "^FCCMPE?(S|D)")>;
903 def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY],
904         (instregex "^FCSEL(S|D)")>;
905 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
906 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>;
907 def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>;
908 def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>;
909 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
910 def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
911 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
912 def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>;
913 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
914 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
915 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
916 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
917 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
918 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>;
919 def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>;
920 def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>;
922 // FP miscellaneous instructions (FCVT* conversions, FJCVTZS, FMOV register/immediate moves)
923 def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
924 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
925 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
926 def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>;
927 def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>;
928 def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>;
929 def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>;
930 def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
931 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
932 def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>;
933 def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;
935 // Integer arithmetic and logical instructions
936 def : InstRW<[Ampere1Write_1cyc_1A],
937         (instregex "ADC(W|X)r", "SBC(W|X)r")>;
938 def : InstRW<[Ampere1Write_Arith],
939         (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r[sx]")>;
940 def : InstRW<[Ampere1Write_1cyc_1AB],
941         (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r[ri]")>;
942 def : InstRW<[Ampere1Write_ArithFlagsetting],
943         (instregex "(ADD|AND|BIC|SUB)S(W|X)r[sx]")>;
944 def : InstRW<[Ampere1Write_1cyc_1A],
945         (instregex "(ADD|AND|BIC|SUB)S(W|X)r[ri]")>;
946 def : InstRW<[Ampere1Write_1cyc_1A],
947         (instregex "(ADC|SBC)S(W|X)r")>;
948 def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>;
949 def : InstRW<[Ampere1Write_1cyc_1A],
950         (instregex "(CCMN|CCMP)(X|W)")>;
951 def : InstRW<[Ampere1Write_1cyc_1A],
952         (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
953 def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>;
954 def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>;
955 def : InstRW<[Ampere1Write_3cyc_1BS],
956         (instregex "(S|U)MULHr")>;
957 def : InstRW<[Ampere1Write_4cyc_1BS],
958         (instregex "(S|U)?M(ADD|SUB)L?r")>;  // NOTE(review): requires "r" directly after M(ADD|SUB)L?; record names with a W/X letter there (e.g. MADDWrrr) would not match -- confirm intent
960 // Integer load instructions
961 def : InstRW<[Ampere1Write_4cyc_2L],
962         (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
963 def : InstRW<[Ampere1Write_4cyc_1L],
964         (instregex "LDR(B|D|H|Q|S)ui")>;  // NOTE(review): B/D/H/Q/S are the register letters the FP/vector store patterns use -- confirm these belong under integer loads
965 def : InstRW<[Ampere1Write_4cyc_1L],
966         (instregex "LDR(D|Q|W|X)l")>;
967 def : InstRW<[Ampere1Write_4cyc_1L],
968         (instregex "LDTR(B|H|W|X)i")>;
969 def : InstRW<[Ampere1Write_4cyc_1L],
970         (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
971 def : InstRW<[Ampere1Write_4cyc_1L],
972         (instregex "LDUR(BB|HH|X|W)i")>;
973 def : InstRW<[Ampere1Write_4cyc_1L],
974         (instregex "LDURS(BW|BX|HW|HX|W)i")>;
975 def : InstRW<[Ampere1Write_5cyc_1AB_1L],
976         (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
977 def : InstRW<[Ampere1Write_1cyc_1L],  // prefetch: PRFM literal/ui forms and PRFUM (PRFUMi was previously listed twice)
978         (instrs PRFMl, PRFMui, PRFUMi)>;
979 def : InstRW<[Ampere1Write_2cyc_1AB_1L],  // prefetch, register-offset forms
980         (instrs PRFMroW, PRFMroX)>;
982 // Integer miscellaneous instructions (ADR/ADRP, EXTR, BFM family, CRC32, CLS, SETF, MOVK/MOVN/MOVZ, RBIT/REV, ASRV/LSLV/LSRV/RORV)
983 def : InstRW<[Ampere1Write_1cyc_1A],  (instrs ADR, ADRP)>;
984 def : InstRW<[Ampere1Write_1cyc_1B],  (instregex "EXTR(W|X)")>;
985 def : InstRW<[Ampere1Write_1cyc_1B],  (instregex "(S|U)?BFM(W|X)")>;
986 def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
987 def : InstRW<[Ampere1Write_1cyc_1B],  (instregex "CLS(W|X)")>;
988 def : InstRW<[Ampere1Write_1cyc_1A],  (instrs SETF8, SETF16)>;
989 def : InstRW<[Ampere1Write_1cyc_1AB],
990         (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
991 def : InstRW<[Ampere1Write_1cyc_1B],
992         (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
993 def : InstRW<[Ampere1Write_1cyc_1B],
994         (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
996 // Integer store instructions
997 def : InstRW<[Ampere1Write_1cyc_2S],  (instregex "STNP(X|W)i")>;
998 def : InstRW<[Ampere1Write_2cyc_1B_1S],
999         (instrs STPWi, STPXi)>;
1000 def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB],
1001         (instregex "STP(W|X)(pre|post)")>;
1002 def : InstRW<[Ampere1Write_1cyc_1S],
1003         (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
1004 def : InstRW<[Ampere1Write_1cyc_1S],
1005         (instregex "STUR(BB|HH|X|W)i",
1006                    "STR(X|W)ui")>;
1007 // NOTE(review): a third pattern here repeated "STUR(BB|HH|X|W)i" and was dropped as redundant; "STR(BB|HH)ui" may have been intended -- confirm.
1008 def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>;
1009 def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>;
1011 // Pointer authentication (the AUT*/PAC* entries below are left commented out)
1012 //def : InstRW<[Ampere1Write_7cyc_1BS],
1013 //      (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>;
1014 def : InstRW<[Ampere1Write_8cyc_1BS_1A],
1015         (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
1016 def : InstRW<[Ampere1Write_8cyc_1BS_2A],
1017         (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
1018 //def : InstRW<[Ampere1Write_7cyc_1BS],
1019 //      (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>;
1020 def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
1021 def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>;
1023 // Vector integer instructions
1024 // -- absolute difference
1025 def : InstRW<[Ampere1Write_3cyc_1XY],
1026              (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
1027                         "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
1028 // -- arithmetic
1029 def : InstRW<[Ampere1Write_3cyc_1XY],
1030         (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
1031                    "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
1032                    "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
1033 // -- arithmetic, horizontal, 16B
1034 def : InstRW<[Ampere1Write_12cyc_4XY],
1035             (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
1036 def : InstRW<[Ampere1Write_12cyc_4XY],
1037             (instregex "^[SU](MIN|MAX)Vv16i8v")>;
1038 // -- arithmetic, horizontal, 4H/4S (add patterns regrouped to match this heading and the MIN/MAX patterns)
1039 def : InstRW<[Ampere1Write_6cyc_2XY],
1040             (instregex "^[SU]?ADDL?V(v4i16|v4i32)v")>;
1041 def : InstRW<[Ampere1Write_6cyc_2XY],
1042             (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
1043 // -- arithmetic, horizontal, 8B/8H (add patterns regrouped to match this heading and the MIN/MAX patterns)
1044 def : InstRW<[Ampere1Write_9cyc_3XY],
1045             (instregex "^[SU]?ADDL?V(v8i8|v8i16)v")>;
1046 def : InstRW<[Ampere1Write_9cyc_3XY],
1047             (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
1048 // -- arithmetic, narrowing
1049 def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
1050 def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
1051 // -- arithmetic, pairwise
1052 def : InstRW<[Ampere1Write_3cyc_1XY],
1053         (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
1054 // -- arithmetic, saturating
1055 def : InstRW<[Ampere1Write_3cyc_1XY],
1056         (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>;
1057 // -- bit count
1058 def : InstRW<[Ampere1Write_2cyc_1XY],
1059         (instregex "^(CLS|CLZ|CNT)v")>;
1060 // -- compare
1061 def : InstRW<[Ampere1Write_3cyc_1XY],
1062         (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv",
1063                    "^CMHIv", "^CMHSv")>;
1064 // -- compare non-zero
1065 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>;
1066 // -- dot product
1067 def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>;
1068 // -- fp reciprocal estimate
1069 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>;
1070 // -- integer reciprocal estimate
1071 def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>;
1072 // -- logical
1073 def : InstRW<[Ampere1Write_2cyc_1XY],
1074         (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
1075 // -- logical, narrowing (the patterns listed are the *SHRN/*SHRUN narrowing shifts plus UQXTN)
1076 def : InstRW<[Ampere1Write_5cyc_2XY],
1077         (instregex "RSHRNv",
1078                    "SHRNv", "SQSHRNv", "SQSHRUNv",
1079                    "UQXTNv")>;
1080 // -- matrix multiply
1081 def : InstRW<[Ampere1Write_6cyc_2XY],
1082         (instrs SMMLA, UMMLA, USMMLA)>;
1083 // -- max/min
1084 def : InstRW<[Ampere1Write_3cyc_1XY],
1085         (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>;
1086 def : InstRW<[Ampere1Write_3cyc_1XY],
1087         (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>;
1088 // -- move immediate
1089 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>;
1090 // -- multiply
1091 def : InstRW<[Ampere1Write_3cyc_1XY],
1092         (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>;
1093 // -- multiply accumulate
1094 def : InstRW<[Ampere1Write_3cyc_1XY],
1095         (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>;
1096 // -- negation, saturating
1097 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>;
1098 // -- reverse bits/bytes
1099 def : InstRW<[Ampere1Write_2cyc_1XY],
1100         (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>;
1101 // -- shift (only the v16i8/v8i16/v4i32/v2i64 arrangements of SSHL/USHL are listed)
1102 def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
1103 // -- shift and accumulate
1104 def : InstRW<[Ampere1Write_3cyc_1XY],
1105         (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;
1106 // -- shift, saturating
1107 def : InstRW<[Ampere1Write_3cyc_1XY],
1108         (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU",
1109                    "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL",
1110                    "^UQSHL")>;
1112 // Vector miscellaneous instructions
1113 // -- duplicate element
1114 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>;
1115 // -- duplicate from GPR
1116 def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>;
1117 // -- extract narrow
1118 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>;
1119 // -- insert/extract element
1120 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>;
1121 // -- move FP immediate
1122 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>;
1123 // -- move element to GPR
1124 def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>;
1125 // -- move from GPR to any element
1126 def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>;
1127 // -- table lookup
1128 def : InstRW<[Ampere1Write_2cyc_1XY],
1129             (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
1130 def : InstRW<[Ampere1Write_4cyc_2XY],
1131             (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
1132 def : InstRW<[Ampere1Write_6cyc_3XY],
1133             (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
1134 def : InstRW<[Ampere1Write_8cyc_4XY],
1135             (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
1136 // -- transpose/unzip
1137 def : InstRW<[Ampere1Write_2cyc_1XY],
1138               (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
1139 // -- zip
1140 def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
1142 } // SchedModel = Ampere1Model