1 //=- AArch64SchedNeoverseV1.td - NeoverseV1 Scheduling Model -*- tablegen -*-=//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines the scheduling model for the Arm Neoverse V1 processors.
12 // - "Arm Neoverse V1 Software Optimization Guide"
13 // - "Arm Neoverse V1 Platform: Unleashing a new performance tier for Arm-based computing"
14 // https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/neoverse-v1-platform-a-new-performance-tier-for-arm
16 // https://en.wikichip.org/wiki/arm_holdings/microarchitectures/neoverse_v1
19 //===----------------------------------------------------------------------===//
21 def NeoverseV1Model : SchedMachineModel {
22 let IssueWidth = 15; // Maximum micro-ops dispatch rate.
23 let MicroOpBufferSize = 256; // Micro-op re-order buffer.
24 let LoadLatency = 4; // Optimistic load latency.
25 let MispredictPenalty = 11; // Cycles cost of branch mispredicted.
26 let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57.
27 let CompleteModel = 1;
29 list<Predicate> UnsupportedFeatures = !listconcat(SVE2Unsupported.F,
35 //===----------------------------------------------------------------------===//
36 // Define each kind of processor resource and the number available on Neoverse V1.
37 // Instructions are first fetched and then decoded into internal macro-ops
38 // (MOPs). From there, the MOPs proceed through register renaming and dispatch
39 // stages. A MOP can be split into one or more micro-ops further down the
40 // pipeline, after the decode stage. Once dispatched, micro-ops wait for their
41 // operands and issue out-of-order to one of the issue pipelines. Each issue
42 // pipeline can accept one micro-op per cycle.
44 let SchedModel = NeoverseV1Model in {
46 // Define the issue ports. The ProcResource<N> argument is the number of identical units.
47 def V1UnitB : ProcResource<2>; // Branch pipelines 0/1
48 def V1UnitS : ProcResource<2>; // Integer single-cycle pipelines 0/1
49 def V1UnitM0 : ProcResource<1>; // Integer multi-cycle pipeline 0
50 def V1UnitM1 : ProcResource<1>; // Integer multi-cycle pipeline 1
51 def V1UnitL01 : ProcResource<2>; // Load/store pipelines 0/1
52 def V1UnitL2 : ProcResource<1>; // Load pipeline 2
53 def V1UnitD : ProcResource<2>; // Store-data pipelines 0/1
54 def V1UnitV0 : ProcResource<1>; // FP/ASIMD pipeline 0
55 def V1UnitV1 : ProcResource<1>; // FP/ASIMD pipeline 1
56 def V1UnitV2 : ProcResource<1>; // FP/ASIMD pipeline 2
57 def V1UnitV3 : ProcResource<1>; // FP/ASIMD pipeline 3
58 def V1UnitFlg : ProcResource<3>; // Flags; listed next to an integer unit by the *_1Flg write types
60 def V1UnitI : ProcResGroup<[V1UnitS,
61 V1UnitM0, V1UnitM1]>; // Any integer unit: S, M0 or M1
62 def V1UnitM : ProcResGroup<[V1UnitM0, V1UnitM1]>; // Either integer multi-cycle unit
63 def V1UnitL : ProcResGroup<[V1UnitL01, V1UnitL2]>; // Any load-capable unit: L01 or L2
64 def V1UnitV : ProcResGroup<[V1UnitV0, V1UnitV1,
65 V1UnitV2, V1UnitV3]>; // Any FP/ASIMD unit
66 def V1UnitV01 : ProcResGroup<[V1UnitV0, V1UnitV1]>; // FP/ASIMD units 0 and 1
67 def V1UnitV02 : ProcResGroup<[V1UnitV0, V1UnitV2]>; // FP/ASIMD units 0 and 2
68 def V1UnitV13 : ProcResGroup<[V1UnitV1, V1UnitV3]>; // FP/ASIMD units 1 and 3
70 // Define commonly used read types.
72 // No generic forwarding is provided for these types: every read advance below is 0.
73 def : ReadAdvance<ReadI, 0>;
74 def : ReadAdvance<ReadISReg, 0>;
75 def : ReadAdvance<ReadIEReg, 0>;
76 def : ReadAdvance<ReadIM, 0>;
77 def : ReadAdvance<ReadIMA, 0>;
78 def : ReadAdvance<ReadID, 0>;
79 def : ReadAdvance<ReadExtrHi, 0>;
80 def : ReadAdvance<ReadAdrBase, 0>;
81 def : ReadAdvance<ReadST, 0>;
82 def : ReadAdvance<ReadVLD, 0>;
84 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } // marked unsupported in this model
85 def : WriteRes<WriteBarrier, []> { let Latency = 1; } // no resources listed, latency 1
86 def : WriteRes<WriteHint, []> { let Latency = 1; } // no resources listed, latency 1
89 //===----------------------------------------------------------------------===//
90 // Define generic 0 micro-op types
92 def V1Write_0c_0Z : SchedWriteRes<[]> { // zero latency, zero micro-ops, no resources
93 let Latency = 0; let NumMicroOps = 0; }
96 //===----------------------------------------------------------------------===//
97 // Define generic 1 micro-op types
99 let Latency = 1 in def V1Write_1c_1B : SchedWriteRes<[V1UnitB]>; // Names read V1Write_<latency>c[<release cycle>]_<count><unit>
100 let Latency = 1 in def V1Write_1c_1I : SchedWriteRes<[V1UnitI]>;
101 let Latency = 1 in def V1Write_1c_1I_1Flg : SchedWriteRes<[V1UnitI, V1UnitFlg]>;
102 let Latency = 4 in def V1Write_4c_1L : SchedWriteRes<[V1UnitL]>;
103 let Latency = 6 in def V1Write_6c_1L : SchedWriteRes<[V1UnitL]>;
104 let Latency = 1 in def V1Write_1c_1L01 : SchedWriteRes<[V1UnitL01]>;
105 let Latency = 4 in def V1Write_4c_1L01 : SchedWriteRes<[V1UnitL01]>;
106 let Latency = 6 in def V1Write_6c_1L01 : SchedWriteRes<[V1UnitL01]>;
107 let Latency = 2 in def V1Write_2c_1M : SchedWriteRes<[V1UnitM]>;
108 let Latency = 2 in def V1Write_2c_1M_1Flg : SchedWriteRes<[V1UnitM, V1UnitFlg]>;
109 let Latency = 3 in def V1Write_3c_1M : SchedWriteRes<[V1UnitM]>;
110 let Latency = 4 in def V1Write_4c_1M : SchedWriteRes<[V1UnitM]>;
111 let Latency = 1 in def V1Write_1c_1M0 : SchedWriteRes<[V1UnitM0]>;
112 let Latency = 2 in def V1Write_2c_1M0 : SchedWriteRes<[V1UnitM0]>;
113 let Latency = 3 in def V1Write_3c_1M0 : SchedWriteRes<[V1UnitM0]>;
114 let Latency = 5 in def V1Write_5c_1M0 : SchedWriteRes<[V1UnitM0]>;
115 let Latency = 12, ReleaseAtCycles = [5] in
116 def V1Write_12c5_1M0 : SchedWriteRes<[V1UnitM0]>;
117 let Latency = 20, ReleaseAtCycles = [5] in
118 def V1Write_20c5_1M0 : SchedWriteRes<[V1UnitM0]>;
119 let Latency = 2 in def V1Write_2c_1V : SchedWriteRes<[V1UnitV]>;
120 let Latency = 3 in def V1Write_3c_1V : SchedWriteRes<[V1UnitV]>;
121 let Latency = 4 in def V1Write_4c_1V : SchedWriteRes<[V1UnitV]>;
122 let Latency = 5 in def V1Write_5c_1V : SchedWriteRes<[V1UnitV]>;
123 let Latency = 2 in def V1Write_2c_1V0 : SchedWriteRes<[V1UnitV0]>;
124 let Latency = 3 in def V1Write_3c_1V0 : SchedWriteRes<[V1UnitV0]>;
125 let Latency = 4 in def V1Write_4c_1V0 : SchedWriteRes<[V1UnitV0]>;
126 let Latency = 6 in def V1Write_6c_1V0 : SchedWriteRes<[V1UnitV0]>;
127 let Latency = 10, ReleaseAtCycles = [7] in
128 def V1Write_10c7_1V0 : SchedWriteRes<[V1UnitV0]>;
129 let Latency = 12, ReleaseAtCycles = [7] in
130 def V1Write_12c7_1V0 : SchedWriteRes<[V1UnitV0]>;
131 let Latency = 13, ReleaseAtCycles = [10] in
132 def V1Write_13c10_1V0 : SchedWriteRes<[V1UnitV0]>;
133 let Latency = 15, ReleaseAtCycles = [7] in
134 def V1Write_15c7_1V0 : SchedWriteRes<[V1UnitV0]>;
135 let Latency = 16, ReleaseAtCycles = [7] in
136 def V1Write_16c7_1V0 : SchedWriteRes<[V1UnitV0]>;
137 let Latency = 20, ReleaseAtCycles = [7] in
138 def V1Write_20c7_1V0 : SchedWriteRes<[V1UnitV0]>;
139 let Latency = 2 in def V1Write_2c_1V01 : SchedWriteRes<[V1UnitV01]>;
140 let Latency = 3 in def V1Write_3c_1V01 : SchedWriteRes<[V1UnitV01]>;
141 let Latency = 4 in def V1Write_4c_1V01 : SchedWriteRes<[V1UnitV01]>;
142 let Latency = 5 in def V1Write_5c_1V01 : SchedWriteRes<[V1UnitV01]>;
143 let Latency = 3 in def V1Write_3c_1V02 : SchedWriteRes<[V1UnitV02]>;
144 let Latency = 4 in def V1Write_4c_1V02 : SchedWriteRes<[V1UnitV02]>;
145 let Latency = 7, ReleaseAtCycles = [7] in
146 def V1Write_7c7_1V02 : SchedWriteRes<[V1UnitV02]>;
147 let Latency = 10, ReleaseAtCycles = [7] in
148 def V1Write_10c7_1V02 : SchedWriteRes<[V1UnitV02]>;
149 let Latency = 13, ReleaseAtCycles = [5] in
150 def V1Write_13c5_1V02 : SchedWriteRes<[V1UnitV02]>;
151 let Latency = 13, ReleaseAtCycles = [11] in
152 def V1Write_13c11_1V02 : SchedWriteRes<[V1UnitV02]>;
153 let Latency = 15, ReleaseAtCycles = [7] in
154 def V1Write_15c7_1V02 : SchedWriteRes<[V1UnitV02]>;
155 let Latency = 16, ReleaseAtCycles = [7] in
156 def V1Write_16c7_1V02 : SchedWriteRes<[V1UnitV02]>;
157 let Latency = 2 in def V1Write_2c_1V1 : SchedWriteRes<[V1UnitV1]>;
158 let Latency = 3 in def V1Write_3c_1V1 : SchedWriteRes<[V1UnitV1]>;
159 let Latency = 4 in def V1Write_4c_1V1 : SchedWriteRes<[V1UnitV1]>;
160 let Latency = 2 in def V1Write_2c_1V13 : SchedWriteRes<[V1UnitV13]>;
161 let Latency = 4 in def V1Write_4c_1V13 : SchedWriteRes<[V1UnitV13]>;
163 //===----------------------------------------------------------------------===//
164 // Define generic 2 micro-op types
166 def V1Write_1c_1B_1S : SchedWriteRes<[V1UnitB, V1UnitS]> {
167 let Latency = 1; let NumMicroOps = 2; }
168 def V1Write_6c_1B_1M0 : SchedWriteRes<[V1UnitB, V1UnitM0]> {
169 let Latency = 6; let NumMicroOps = 2; }
170 def V1Write_3c_1I_1M : SchedWriteRes<[V1UnitI, V1UnitM]> {
171 let Latency = 3; let NumMicroOps = 2; }
172 def V1Write_5c_1I_1L : SchedWriteRes<[V1UnitI, V1UnitL]> {
173 let Latency = 5; let NumMicroOps = 2; }
174 def V1Write_7c_1I_1L : SchedWriteRes<[V1UnitI, V1UnitL]> {
175 let Latency = 7; let NumMicroOps = 2; }
176 def V1Write_6c_2L : SchedWriteRes<[V1UnitL, V1UnitL]> {
177 let Latency = 6; let NumMicroOps = 2; }
178 def V1Write_6c_1L_1M : SchedWriteRes<[V1UnitL, V1UnitM]> {
179 let Latency = 6; let NumMicroOps = 2; }
180 def V1Write_8c_1L_1V : SchedWriteRes<[V1UnitL, V1UnitV]> {
181 let Latency = 8; let NumMicroOps = 2; }
182 def V1Write_9c_1L_1V : SchedWriteRes<[V1UnitL, V1UnitV]> {
183 let Latency = 9; let NumMicroOps = 2; }
184 def V1Write_11c_1L_1V : SchedWriteRes<[V1UnitL, V1UnitV]> {
185 let Latency = 11; let NumMicroOps = 2; }
186 def V1Write_1c_1L01_1D : SchedWriteRes<[V1UnitL01, V1UnitD]> {
187 let Latency = 1; let NumMicroOps = 2; }
188 def V1Write_6c_1L01_1S : SchedWriteRes<[V1UnitL01, V1UnitS]> {
189 let Latency = 6; let NumMicroOps = 2; }
190 def V1Write_7c_1L01_1S : SchedWriteRes<[V1UnitL01, V1UnitS]> {
191 let Latency = 7; let NumMicroOps = 2; }
192 def V1Write_2c_1L01_1V : SchedWriteRes<[V1UnitL01, V1UnitV]> {
193 let Latency = 2; let NumMicroOps = 2; }
194 def V1Write_4c_1L01_1V : SchedWriteRes<[V1UnitL01, V1UnitV]> {
195 let Latency = 4; let NumMicroOps = 2; }
196 def V1Write_6c_1L01_1V : SchedWriteRes<[V1UnitL01, V1UnitV]> {
197 let Latency = 6; let NumMicroOps = 2; }
198 def V1Write_2c_1L01_1V01 : SchedWriteRes<[V1UnitL01, V1UnitV01]> {
199 let Latency = 2; let NumMicroOps = 2; }
200 def V1Write_4c_1L01_1V01 : SchedWriteRes<[V1UnitL01, V1UnitV01]> {
201 let Latency = 4; let NumMicroOps = 2; }
202 def V1Write_2c_2M0 : SchedWriteRes<[V1UnitM0, V1UnitM0]> {
203 let Latency = 2; let NumMicroOps = 2; }
204 def V1Write_3c_2M0 : SchedWriteRes<[V1UnitM0, V1UnitM0]> {
205 let Latency = 3; let NumMicroOps = 2; }
206 def V1Write_9c_1M0_1L : SchedWriteRes<[V1UnitM0, V1UnitL]> {
207 let Latency = 9; let NumMicroOps = 2; }
208 def V1Write_5c_1M0_1V : SchedWriteRes<[V1UnitM0, V1UnitV]> {
209 let Latency = 5; let NumMicroOps = 2; }
210 def V1Write_4c_1M0_1V0 : SchedWriteRes<[V1UnitM0, V1UnitV0]> {
211 let Latency = 4; let NumMicroOps = 2; }
212 let Latency = 7, NumMicroOps = 2 in // 7-cycle latency, two micro-ops
213 def V1Write_7c_1M0_1V0 : SchedWriteRes<[V1UnitM0, V1UnitV0]>; // M0 + V0, as the _1M0_1V0 suffix states (was V1UnitV1)
214 def V1Write_5c_1M0_1V01 : SchedWriteRes<[V1UnitM0, V1UnitV01]> {
215 let Latency = 5; let NumMicroOps = 2; }
216 def V1Write_6c_1M0_1V1 : SchedWriteRes<[V1UnitM0, V1UnitV1]> {
217 let Latency = 6; let NumMicroOps = 2; }
218 def V1Write_9c_1M0_1V1 : SchedWriteRes<[V1UnitM0, V1UnitV1]> {
219 let Latency = 9; let NumMicroOps = 2; }
220 def V1Write_4c_2V : SchedWriteRes<[V1UnitV, V1UnitV]> {
221 let Latency = 4; let NumMicroOps = 2; }
222 def V1Write_8c_1V_1V01 : SchedWriteRes<[V1UnitV, V1UnitV01]> {
223 let Latency = 8; let NumMicroOps = 2; }
224 def V1Write_4c_2V0 : SchedWriteRes<[V1UnitV0, V1UnitV0]> {
225 let Latency = 4; let NumMicroOps = 2; }
226 def V1Write_5c_2V0 : SchedWriteRes<[V1UnitV0, V1UnitV0]> {
227 let Latency = 5; let NumMicroOps = 2; }
228 def V1Write_2c_2V01 : SchedWriteRes<[V1UnitV01, V1UnitV01]> {
229 let Latency = 2; let NumMicroOps = 2; }
230 def V1Write_4c_2V01 : SchedWriteRes<[V1UnitV01, V1UnitV01]> {
231 let Latency = 4; let NumMicroOps = 2; }
232 def V1Write_4c_2V02 : SchedWriteRes<[V1UnitV02, V1UnitV02]> {
233 let Latency = 4; let NumMicroOps = 2; }
234 def V1Write_6c_2V02 : SchedWriteRes<[V1UnitV02, V1UnitV02]> {
235 let Latency = 6; let NumMicroOps = 2; }
236 def V1Write_4c_1V13_1V : SchedWriteRes<[V1UnitV13, V1UnitV]> {
237 let Latency = 4; let NumMicroOps = 2; }
238 def V1Write_4c_2V13 : SchedWriteRes<[V1UnitV13, V1UnitV13]> {
239 let Latency = 4; let NumMicroOps = 2; }
241 //===----------------------------------------------------------------------===//
242 // Define generic 3 micro-op types
244 def V1Write_2c_1I_1L01_1V01 : SchedWriteRes<[V1UnitI, V1UnitL01, V1UnitV01]> {
245 let Latency = 2; let NumMicroOps = 3; }
246 def V1Write_7c_2M0_1V01 : SchedWriteRes<[V1UnitM0, V1UnitM0, V1UnitV01]> {
247 let Latency = 7; let NumMicroOps = 3; }
248 def V1Write_8c_1L_2V : SchedWriteRes<[V1UnitL, V1UnitV, V1UnitV]> {
249 let Latency = 8; let NumMicroOps = 3; }
250 def V1Write_6c_3L : SchedWriteRes<[V1UnitL, V1UnitL, V1UnitL]> {
251 let Latency = 6; let NumMicroOps = 3; }
252 def V1Write_2c_1L01_1S_1V : SchedWriteRes<[V1UnitL01, V1UnitS, V1UnitV]> {
253 let Latency = 2; let NumMicroOps = 3; }
254 def V1Write_4c_1L01_1S_1V : SchedWriteRes<[V1UnitL01, V1UnitS, V1UnitV]> {
255 let Latency = 4; let NumMicroOps = 3; }
256 def V1Write_2c_2L01_1V01 : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitV01]> {
257 let Latency = 2; let NumMicroOps = 3; }
258 def V1Write_6c_3V : SchedWriteRes<[V1UnitV, V1UnitV, V1UnitV]> {
259 let Latency = 6; let NumMicroOps = 3; }
260 def V1Write_4c_3V01 : SchedWriteRes<[V1UnitV01, V1UnitV01, V1UnitV01]> {
261 let Latency = 4; let NumMicroOps = 3; }
262 def V1Write_6c_3V01 : SchedWriteRes<[V1UnitV01, V1UnitV01, V1UnitV01]> {
263 let Latency = 6; let NumMicroOps = 3; }
264 def V1Write_8c_3V01 : SchedWriteRes<[V1UnitV01, V1UnitV01, V1UnitV01]> {
265 let Latency = 8; let NumMicroOps = 3; }
267 //===----------------------------------------------------------------------===//
268 // Define generic 4 micro-op types
270 def V1Write_8c_2M0_2V0 : SchedWriteRes<[V1UnitM0, V1UnitM0,
271 V1UnitV0, V1UnitV0]> {
272 let Latency = 8; let NumMicroOps = 4; }
273 def V1Write_7c_4L : SchedWriteRes<[V1UnitL, V1UnitL, V1UnitL, V1UnitL]> {
274 let Latency = 7; let NumMicroOps = 4; }
275 let Latency = 8, NumMicroOps = 4 in
276 def V1Write_8c_2L_2V : SchedWriteRes<[V1UnitL, V1UnitL,
278 let Latency = 9, NumMicroOps = 4 in
279 def V1Write_9c_2L_2V : SchedWriteRes<[V1UnitL, V1UnitL,
281 let Latency = 11, NumMicroOps = 4 in
282 def V1Write_11c_2L_2V : SchedWriteRes<[V1UnitL, V1UnitL,
284 let Latency = 10, NumMicroOps = 4 in
285 def V1Write_10c_2L01_2V : SchedWriteRes<[V1UnitL01, V1UnitL01,
287 def V1Write_2c_2L01_2V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
288 V1UnitV01, V1UnitV01]> {
289 let Latency = 2; let NumMicroOps = 4; }
290 def V1Write_4c_2L01_2V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
291 V1UnitV01, V1UnitV01]> {
292 let Latency = 4; let NumMicroOps = 4; }
293 def V1Write_8c_2L01_2V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
294 V1UnitV01, V1UnitV01]> {
295 let Latency = 8; let NumMicroOps = 4; }
296 def V1Write_9c_2L01_2V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
297 V1UnitV01, V1UnitV01]> {
298 let Latency = 9; let NumMicroOps = 4; }
299 def V1Write_10c_2L01_2V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
300 V1UnitV01, V1UnitV01]> {
301 let Latency = 10; let NumMicroOps = 4; }
302 def V1Write_10c_1V_1V01_2V1 : SchedWriteRes<[V1UnitV, V1UnitV01,
303 V1UnitV1, V1UnitV1]> {
304 let Latency = 10; let NumMicroOps = 4; }
305 def V1Write_12c_1V_1V01_2V1 : SchedWriteRes<[V1UnitV, V1UnitV01,
306 V1UnitV1, V1UnitV1]> {
307 let Latency = 12; let NumMicroOps = 4; }
308 def V1Write_6c_4V0 : SchedWriteRes<[V1UnitV0, V1UnitV0,
309 V1UnitV0, V1UnitV0]> {
310 let Latency = 6; let NumMicroOps = 4; }
311 def V1Write_12c_4V01 : SchedWriteRes<[V1UnitV01, V1UnitV01,
312 V1UnitV01, V1UnitV01]> {
313 let Latency = 12; let NumMicroOps = 4; }
314 let Latency = 6, NumMicroOps = 4 in // 6-cycle latency, four micro-ops
315 def V1Write_6c_4V02 : SchedWriteRes<[V1UnitV02, V1UnitV02, V1UnitV02, V1UnitV02]>; // one V02 entry per micro-op, matching the 4V02 name; NOTE(review): confirm throughput against the SOG
317 //===----------------------------------------------------------------------===//
318 // Define generic 5 micro-op types
320 def V1Write_8c_2L_3V : SchedWriteRes<[V1UnitL, V1UnitL,
321 V1UnitV, V1UnitV, V1UnitV]> {
322 let Latency = 8; let NumMicroOps = 5; }
323 let Latency = 14, NumMicroOps = 5 in
324 def V1Write_14c_1V_1V0_2V1_1V13 : SchedWriteRes<[V1UnitV,
328 def V1Write_9c_1V_4V01 : SchedWriteRes<[V1UnitV,
329 V1UnitV01, V1UnitV01,
330 V1UnitV01, V1UnitV01]> {
331 let Latency = 9; let NumMicroOps = 5; }
332 def V1Write_6c_5V01 : SchedWriteRes<[V1UnitV01, V1UnitV01,
333 V1UnitV01, V1UnitV01, V1UnitV01]> {
334 let Latency = 6; let NumMicroOps = 5; }
336 //===----------------------------------------------------------------------===//
337 // Define generic 6 micro-op types
339 def V1Write_6c_3L_3V : SchedWriteRes<[V1UnitL, V1UnitL, V1UnitL,
340 V1UnitV, V1UnitV, V1UnitV]> {
341 let Latency = 6; let NumMicroOps = 6; }
342 def V1Write_8c_3L_3V : SchedWriteRes<[V1UnitL, V1UnitL, V1UnitL,
343 V1UnitV, V1UnitV, V1UnitV]> {
344 let Latency = 8; let NumMicroOps = 6; }
345 def V1Write_2c_3L01_3V01 : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
346 V1UnitV01, V1UnitV01, V1UnitV01]> {
347 let Latency = 2; let NumMicroOps = 6; }
348 def V1Write_5c_3L01_3V01 : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
349 V1UnitV01, V1UnitV01, V1UnitV01]> {
350 let Latency = 5; let NumMicroOps = 6; }
351 def V1Write_6c_3L01_3V01 : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
352 V1UnitV01, V1UnitV01, V1UnitV01]> {
353 let Latency = 6; let NumMicroOps = 6; }
354 def V1Write_11c_3L01_3V01 : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
355 V1UnitV01, V1UnitV01, V1UnitV01]> {
356 let Latency = 11; let NumMicroOps = 6; }
357 def V1Write_11c_1V_5V01 : SchedWriteRes<[V1UnitV,
358 V1UnitV01, V1UnitV01,
359 V1UnitV01, V1UnitV01, V1UnitV01]> {
360 let Latency = 11; let NumMicroOps = 6; }
361 def V1Write_13c_6V01 : SchedWriteRes<[V1UnitV01, V1UnitV01, V1UnitV01,
362 V1UnitV01, V1UnitV01, V1UnitV01]> {
363 let Latency = 13; let NumMicroOps = 6; }
365 //===----------------------------------------------------------------------===//
366 // Define generic 7 micro-op types
368 def V1Write_8c_3L_4V : SchedWriteRes<[V1UnitL, V1UnitL, V1UnitL,
369 V1UnitV, V1UnitV, V1UnitV, V1UnitV]> {
370 let Latency = 8; let NumMicroOps = 7; }
371 let Latency = 13, NumMicroOps = 7 in // 13-cycle latency, matching the 13c in the name (was 8)
372 def V1Write_13c_3L01_1S_3V01 : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
374 V1UnitV01, V1UnitV01, V1UnitV01]>;
376 //===----------------------------------------------------------------------===//
377 // Define generic 8 micro-op types
379 let Latency = 9, NumMicroOps = 8 in
380 def V1Write_9c_4L_4V : SchedWriteRes<[V1UnitL, V1UnitL,
384 def V1Write_2c_4L01_4V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
385 V1UnitL01, V1UnitL01,
386 V1UnitV01, V1UnitV01,
387 V1UnitV01, V1UnitV01]> {
388 let Latency = 2; let NumMicroOps = 8; }
389 def V1Write_4c_4L01_4V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
390 V1UnitL01, V1UnitL01,
391 V1UnitV01, V1UnitV01,
392 V1UnitV01, V1UnitV01]> {
393 let Latency = 4; let NumMicroOps = 8; }
394 def V1Write_12c_4L01_4V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
395 V1UnitL01, V1UnitL01,
396 V1UnitV01, V1UnitV01,
397 V1UnitV01, V1UnitV01]> {
398 let Latency = 12; let NumMicroOps = 8; }
400 //===----------------------------------------------------------------------===//
401 // Define generic 10 micro-op types
403 let Latency = 13, NumMicroOps = 10 in
404 def V1Write_13c_4L01_2S_4V01 : SchedWriteRes<[V1UnitL01, V1UnitL01,
405 V1UnitL01, V1UnitL01,
407 V1UnitV01, V1UnitV01,
408 V1UnitV01, V1UnitV01]>;
409 let Latency = 7, NumMicroOps = 10 in
410 def V1Write_7c_5L01_5V : SchedWriteRes<[V1UnitL01, V1UnitL01,
411 V1UnitL01, V1UnitL01, V1UnitL01,
413 V1UnitV, V1UnitV, V1UnitV]>;
414 def V1Write_11c_10V0 : SchedWriteRes<[V1UnitV0,
415 V1UnitV0, V1UnitV0, V1UnitV0,
416 V1UnitV0, V1UnitV0, V1UnitV0,
417 V1UnitV0, V1UnitV0, V1UnitV0]> {
418 let Latency = 11; let NumMicroOps = 10; }
420 //===----------------------------------------------------------------------===//
421 // Define generic 12 micro-op types
423 def V1Write_7c_6L01_6V01 : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
424 V1UnitL01, V1UnitL01, V1UnitL01,
425 V1UnitV01, V1UnitV01, V1UnitV01,
426 V1UnitV01, V1UnitV01, V1UnitV01]> {
427 let Latency = 7; let NumMicroOps = 12; }
429 //===----------------------------------------------------------------------===//
430 // Define generic 15 micro-op types
432 let Latency = 7, NumMicroOps = 15 in
433 def V1Write_7c_5L01_5S_5V : SchedWriteRes<[V1UnitL01, V1UnitL01,
434 V1UnitL01, V1UnitL01, V1UnitL01,
436 V1UnitS, V1UnitS, V1UnitS,
438 V1UnitV, V1UnitV, V1UnitV]>;
441 //===----------------------------------------------------------------------===//
442 // Define generic 18 micro-op types
444 let Latency = 11, NumMicroOps = 18 in // 11-cycle latency, matching the 11c in the name (was 19)
445 def V1Write_11c_9L01_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
446 V1UnitL01, V1UnitL01, V1UnitL01,
447 V1UnitL01, V1UnitL01, V1UnitL01,
448 V1UnitV, V1UnitV, V1UnitV,
449 V1UnitV, V1UnitV, V1UnitV,
450 V1UnitV, V1UnitV, V1UnitV]>;
451 def V1Write_19c_18V0 : SchedWriteRes<[V1UnitV0, V1UnitV0, V1UnitV0,
452 V1UnitV0, V1UnitV0, V1UnitV0,
453 V1UnitV0, V1UnitV0, V1UnitV0,
454 V1UnitV0, V1UnitV0, V1UnitV0,
455 V1UnitV0, V1UnitV0, V1UnitV0,
456 V1UnitV0, V1UnitV0, V1UnitV0]> {
457 let Latency = 19; let NumMicroOps = 18; }
459 //===----------------------------------------------------------------------===//
460 // Define generic 27 micro-op types
462 def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
463 V1UnitL01, V1UnitL01, V1UnitL01,
464 V1UnitL01, V1UnitL01, V1UnitL01,
465 V1UnitS, V1UnitS, V1UnitS,
466 V1UnitS, V1UnitS, V1UnitS,
467 V1UnitS, V1UnitS, V1UnitS,
468 V1UnitV, V1UnitV, V1UnitV,
469 V1UnitV, V1UnitV, V1UnitV,
470 V1UnitV, V1UnitV, V1UnitV]> {
471 let Latency = 11; let NumMicroOps = 27; }
473 //===----------------------------------------------------------------------===//
474 // Define forwarded types
476 // NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for
477 // consumers of 64 bit multiply high operations?
478 let Latency = 2 in def V1Wr_IM : SchedWriteRes<[V1UnitM]>;
479 let Latency = 2 in def V1Wr_IMA : SchedWriteRes<[V1UnitM0]>;
480 def V1WriteIM : SchedWriteVariant<
481 [SchedVar<NeoverseMULIdiomPred, [V1Wr_IM]>,
482 SchedVar<NoSchedPred, [V1Wr_IMA]>]>;
483 def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>;
485 let Latency = 4 in def V1Wr_FMA : SchedWriteRes<[V1UnitV]>;
486 def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>;
488 let Latency = 4 in def V1Wr_ADA : SchedWriteRes<[V1UnitV13]>;
489 def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>;
491 let Latency = 3 in def V1Wr_VDOT : SchedWriteRes<[V1UnitV]>;
492 def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>;
494 let Latency = 3 in def V1Wr_VMMA : SchedWriteRes<[V1UnitV]>;
495 def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>;
497 let Latency = 4 in def V1Wr_VMA : SchedWriteRes<[V1UnitV02]>;
498 def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>;
500 let Latency = 4 in def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]>;
501 def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>;
503 let Latency = 4 in def V1Wr_VSA : SchedWriteRes<[V1UnitV13]>;
504 def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>;
506 let Latency = 4 in def V1Wr_FCMA : SchedWriteRes<[V1UnitV]>;
507 def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>;
509 let Latency = 3 in def V1Wr_FPM : SchedWriteRes<[V1UnitV]>;
510 let Latency = 4 in def V1Wr_FPMA : SchedWriteRes<[V1UnitV]>;
511 def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>;
513 let Latency = 5 in def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]>;
514 def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>;
516 let Latency = 4 in def V1Wr_BFD : SchedWriteRes<[V1UnitV]>;
517 def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>;
519 let Latency = 5 in def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]>;
520 def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>;
522 let Latency = 4 in def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]>;
523 def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>;
525 let Latency = 2 in def V1Wr_CRC : SchedWriteRes<[V1UnitM0]>;
526 def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>;
528 let Latency = 3 in def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]>;
529 def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>;
531 let Latency = 3 in def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]>;
532 def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>;
534 let Latency = 4 in def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]>;
535 def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>;
537 let Latency = 3 in def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]>;
538 def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>;
540 def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]> {
541 let Latency = 5; let NumMicroOps = 2; }
542 def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>;
544 let Latency = 5 in def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]>;
545 def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>;
547 let Latency = 4 in def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]>;
548 def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>;
550 let Latency = 4 in def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]>;
551 def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>;
552 let Latency = 5 in def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]>;
553 def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>;
554 let Latency = 5 in def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]>;
555 def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>;
557 // Miscellaneous Instructions
558 // -----------------------------------------------------------------------------
561 def : InstRW<[V1Write_1c_1I], (instrs COPY)>; // COPY: latency 1 on any integer unit
564 def : WriteRes<WriteSys, []> { let Latency = 1; } // no resources listed, latency 1
567 // Branch Instructions
568 // -----------------------------------------------------------------------------
571 // Compare and branch
572 def : SchedAlias<WriteBr, V1Write_1c_1B>; // latency 1 on a branch unit
575 def : SchedAlias<WriteBrReg, V1Write_1c_1B>; // latency 1 on a branch unit
577 // Branch and link, immed
578 // Branch and link, register
579 def : InstRW<[V1Write_1c_1B_1S], (instrs BL, BLR)>; // two micro-ops: branch unit + single-cycle integer unit
581 // Compare and branch
582 def : InstRW<[V1Write_1c_1B], (instregex "^[CT]BN?Z[XW]$")>; // CBZ/CBNZ/TBZ/TBNZ, W and X forms
585 // Arithmetic and Logical Instructions
586 // -----------------------------------------------------------------------------
589 // Conditional compare
590 // Conditional select
592 // Address generation
594 // Reverse bits/bytes
596 def : SchedAlias<WriteI, V1Write_1c_1I>; // latency 1 on any integer unit (S, M0 or M1)
598 // ALU, basic, flagset
599 def : InstRW<[V1Write_1c_1I_1Flg],
600 (instregex "^(ADD|SUB)S[WX]r[ir]$",
603 "^(AND|BIC)S[WX]rr$")>;
605 // ALU, extend and shift
606 def : SchedAlias<WriteIEReg, V1Write_2c_1M>; // latency 2 on an integer multi-cycle unit
608 // Arithmetic, LSL shift, shift <= 4
609 // Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
610 def V1WriteISReg : SchedWriteVariant<
611 [SchedVar<IsCheapLSL, [V1Write_1c_1I]>, // cheap LSL: latency 1, any integer unit
612 SchedVar<NoSchedPred, [V1Write_2c_1M]>]>; // fallback: latency 2, multi-cycle unit
613 def : SchedAlias<WriteISReg, V1WriteISReg>;
615 // Arithmetic, flagset, LSL shift, shift <= 4
616 // Arithmetic, flagset, LSR/ASR/ROR shift or LSL shift > 4
617 def V1WriteISRegS : SchedWriteVariant<
618 [SchedVar<IsCheapLSL, [V1Write_1c_1I_1Flg]>, // cheap LSL: latency 1, integer unit + flags
619 SchedVar<NoSchedPred, [V1Write_2c_1M_1Flg]>]>; // fallback: latency 2, multi-cycle unit + flags
620 def : InstRW<[V1WriteISRegS],
621 (instregex "^(ADD|SUB)S(([WX]r[sx])|Xrx64)$")>; // ADDS/SUBS shifted-register and extended-register forms
623 // Logical, shift, no flagset
624 def : InstRW<[V1Write_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)[WX]rs$")>; // latency 1, any integer unit
626 // Logical, shift, flagset
627 def : InstRW<[V1Write_2c_1M_1Flg], (instregex "^(AND|BIC)S[WX]rs$")>; // latency 2, multi-cycle unit + flags
629 // Flag manipulation instructions
630 def : InstRW<[V1Write_1c_1I_1Flg], (instrs SETF8, SETF16, RMIF, CFINV)>; // latency 1, integer unit + flags
633 // Divide and multiply instructions
634 // -----------------------------------------------------------------------------
637 def : SchedAlias<WriteID32, V1Write_12c5_1M0>; // latency 12 on M0, ReleaseAtCycles = [5]
638 def : SchedAlias<WriteID64, V1Write_20c5_1M0>; // latency 20 on M0, ReleaseAtCycles = [5]
640 def : SchedAlias<WriteIM32, V1Write_2c_1M>; // latency 2 on an integer multi-cycle unit
641 def : SchedAlias<WriteIM64, V1Write_2c_1M>; // latency 2 on an integer multi-cycle unit
644 // Multiply accumulate, W-form
645 // Multiply accumulate, X-form
646 def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], // third source operand is read through V1Rd_IMA
647 (instregex "^M(ADD|SUB)[WX]rrr$")>;
649 // Multiply accumulate long
651 def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], // third source operand is read through V1Rd_IMA
652 (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
654 def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>; // multiply high: latency 3, no V1Rd_IMA read
657 // Pointer Authentication Instructions (v8.3 PAC)
658 // -----------------------------------------------------------------------------
660 // Authenticate data address
661 // Authenticate instruction address
662 // Compute pointer authentication code for data address
663 // Compute pointer authentication code, using generic key
664 // Compute pointer authentication code for instruction address
665 def : InstRW<[V1Write_5c_1M0], (instregex "^AUT",
668 // Branch and link, register, with pointer authentication
669 // Branch, register, with pointer authentication
670 // Branch, return, with pointer authentication
671 def : InstRW<[V1Write_6c_1B_1M0], (instregex "^BL?RA[AB]Z?$",
674 // Load register, with pointer authentication
675 def : InstRW<[V1Write_9c_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>; // latency 9, two micro-ops: M0 + a load unit
677 // Strip pointer authentication code
678 def : InstRW<[V1Write_2c_1M0], (instrs XPACD, XPACI, XPACLRI)>; // latency 2 on M0
681 // Miscellaneous data-processing instructions
682 // -----------------------------------------------------------------------------
684 // Bitfield extract, one reg
685 // Bitfield extract, two regs
686 def V1WriteExtr : SchedWriteVariant<
687 [SchedVar<IsRORImmIdiomPred, [V1Write_1c_1I]>, // ROR-immediate idiom: latency 1, any integer unit
688 SchedVar<NoSchedPred, [V1Write_3c_1I_1M]>]>; // fallback: latency 3, two micro-ops (I + M)
689 def : SchedAlias<WriteExtr, V1WriteExtr>;
691 // Bitfield move, basic
693 def : SchedAlias<WriteIS, V1Write_1c_1I>; // latency 1, any integer unit
695 // Bitfield move, insert
696 def : InstRW<[V1Write_2c_1M], (instregex "^BFM[WX]ri$")>; // latency 2, multi-cycle unit
699 def : SchedAlias<WriteImm, V1Write_1c_1I>; // latency 1, any integer unit
703 // -----------------------------------------------------------------------------
705 // Load register, immed offset
706 def : SchedAlias<WriteLD, V1Write_4c_1L>; // latency 4 on any load unit
708 // Load register, immed offset, index
709 def : SchedAlias<WriteLDIdx, V1Write_4c_1L>; // latency 4 on any load unit
710 def : SchedAlias<WriteAdr, V1Write_1c_1I>; // latency 1 on any integer unit
712 // Load pair, immed offset
713 def : SchedAlias<WriteLDHi, V1Write_4c_1L>; // latency 4 on any load unit
714 def : InstRW<[V1Write_4c_1L, V1Write_0c_0Z], (instrs LDPWi, LDNPWi)>; // second result uses the zero-latency, zero-micro-op write
715 def : InstRW<[WriteAdr, V1Write_4c_1L, V1Write_0c_0Z], // WriteAdr is listed first, then the two loaded results
716 (instrs LDPWpost, LDPWpre)>;
718 // Load pair, signed immed offset, signed words
719 def : InstRW<[V1Write_5c_1I_1L, V1Write_0c_0Z], (instrs LDPSWi)>; // latency 5, two micro-ops (I + L) for the first result
721 // Load pair, immed post or pre-index, signed words
722 def : InstRW<[WriteAdr, V1Write_5c_1I_1L, V1Write_0c_0Z], // WriteAdr is listed first, then the two loaded results
723 (instrs LDPSWpost, LDPSWpre)>;
726 // Store instructions
727 // -----------------------------------------------------------------------------
729 // Store register, immed offset
730 def : SchedAlias<WriteST, V1Write_1c_1L01_1D>; // latency 1, two micro-ops: load/store unit + store-data unit
732 // Store register, immed offset, index
733 def : SchedAlias<WriteSTIdx, V1Write_1c_1L01_1D>; // latency 1, two micro-ops: load/store unit + store-data unit
735 // Store pair, immed offset
736 def : SchedAlias<WriteSTP, V1Write_1c_1L01_1D>; // latency 1, two micro-ops: load/store unit + store-data unit
739 // FP data processing instructions
740 // -----------------------------------------------------------------------------
746 def : SchedAlias<WriteF, V1Write_2c_1V>; // latency 2 on any FP/ASIMD unit
749 def : SchedAlias<WriteFCmp, V1Write_2c_1V0>; // latency 2 on V0
753 def : SchedAlias<WriteFDiv, V1Write_10c7_1V02>; // latency 10 on V0/V2, ReleaseAtCycles = [7]
756 // FP square root, H-form
757 def : InstRW<[V1Write_7c7_1V02], (instrs FDIVHrr, FSQRTHr)>; // latency 7 on V0/V2, ReleaseAtCycles = [7]
760 // FP square root, S-form
761 def : InstRW<[V1Write_10c7_1V02], (instrs FDIVSrr, FSQRTSr)>; // latency 10 on V0/V2, ReleaseAtCycles = [7]
764 def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>; // latency 15 on V0/V2, ReleaseAtCycles = [7]
766 // FP square root, D-form
767 def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>; // latency 16 on V0/V2, ReleaseAtCycles = [7]
770 def : WriteRes<WriteFMul, [V1UnitV]> { let Latency = 3; } // latency 3 on any FP/ASIMD unit; also listed as a producer in V1Rd_FMA
772 // FP multiply accumulate
773 def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], // third source operand is read through V1Rd_FMA
774 (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
776 // FP round to integral
777 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$", // latency 3 on V0/V2
778 "^FRINT(32|64)[XZ][SD]r$")>;
781 def : InstRW<[V1Write_2c_1V01], (instregex "^FCSEL[HSD]rrr$")>; // FCSEL: latency 2 on V0/V1
784 // FP miscellaneous instructions
785 // -----------------------------------------------------------------------------
787 // FP convert, from gen to vec reg
788 def : InstRW<[V1Write_3c_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
790 // FP convert, from vec to gen reg
791 def : InstRW<[V1Write_3c_1V0], (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]r$")>;
793 // FP convert, JavaScript from vec to gen reg
794 def : InstRW<[V1Write_3c_1V0], (instrs FJCVTZS)>;
796 // FP convert, from vec to vec reg
797 def : SchedAlias<WriteFCvt, V1Write_3c_1V02>;
// FP move, immed
800 def : SchedAlias<WriteFImm, V1Write_2c_1V>;
// FP move, register
803 def : InstRW<[V1Write_2c_1V], (instrs FMOVHr, FMOVSr, FMOVDr)>;
805 // FP transfer, from gen to low half of vec reg
806 def : InstRW<[V1Write_3c_1M0], (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>;
808 // FP transfer, from gen to high half of vec reg
809 def : InstRW<[V1Write_5c_1M0_1V], (instrs FMOVXDHighr)>;
811 // FP transfer, from vec to gen reg
812 def : SchedAlias<WriteFCopy, V1Write_2c_1V1>;
815 // FP load instructions
816 // -----------------------------------------------------------------------------
818 // Load vector reg, literal, S/D/Q forms
819 // Load vector reg, unscaled immed
820 // Load vector reg, unsigned immed
821 def : InstRW<[V1Write_6c_1L, ReadAdrBase], (instregex "^LDR[SDQ]l$",
825 // Load vector reg, immed post-index
826 // Load vector reg, immed pre-index
827 def : InstRW<[WriteAdr, V1Write_6c_1L],
828 (instregex "^LDR[BHSDQ](post|pre)$")>;
830 // Load vector reg, register offset, basic
831 // Load vector reg, register offset, scale, S/D-form
832 // Load vector reg, register offset, extend
833 // Load vector reg, register offset, extend, scale, S/D-form
834 def : InstRW<[V1Write_6c_1L, ReadAdrBase], (instregex "^LDR[BSD]ro[WX]$")>;
836 // Load vector reg, register offset, scale, H/Q-form
837 // Load vector reg, register offset, extend, scale, H/Q-form
838 def : InstRW<[V1Write_7c_1I_1L, ReadAdrBase], (instregex "^LDR[HQ]ro[WX]$")>;
840 // Load vector pair, immed offset, S/D-form
841 def : InstRW<[V1Write_6c_1L, V1Write_0c_0Z], (instregex "^LDN?P[SD]i$")>;
843 // Load vector pair, immed offset, Q-form
844 def : InstRW<[V1Write_6c_1L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
846 // Load vector pair, immed post-index, S/D-form
847 // Load vector pair, immed pre-index, S/D-form
848 def : InstRW<[WriteAdr, V1Write_6c_1L, V1Write_0c_0Z],
849 (instregex "^LDP[SD](pre|post)$")>;
851 // Load vector pair, immed post-index, Q-form
852 // Load vector pair, immed pre-index, Q-form
853 def : InstRW<[WriteAdr, V1Write_6c_1L, WriteLDHi],
854 (instrs LDPQpost, LDPQpre)>;
857 // FP store instructions
858 // -----------------------------------------------------------------------------
860 // Store vector reg, unscaled immed, B/H/S/D/Q-form
861 def : InstRW<[V1Write_2c_1L01_1V01], (instregex "^STUR[BHSDQ]i$")>;
863 // Store vector reg, immed post-index, B/H/S/D/Q-form
864 // Store vector reg, immed pre-index, B/H/S/D/Q-form
865 def : InstRW<[WriteAdr, V1Write_2c_1L01_1V01],
866 (instregex "^STR[BHSDQ](pre|post)$")>;
868 // Store vector reg, unsigned immed, B/H/S/D/Q-form
869 def : InstRW<[V1Write_2c_1L01_1V01], (instregex "^STR[BHSDQ]ui$")>;
871 // Store vector reg, register offset, basic, B/S/D-form
872 // Store vector reg, register offset, scale, B/S/D-form
873 // Store vector reg, register offset, extend, B/S/D-form
874 // Store vector reg, register offset, extend, scale, B/S/D-form
875 def : InstRW<[V1Write_2c_1L01_1V01, ReadAdrBase],
876 (instregex "^STR[BSD]ro[WX]$")>;
878 // Store vector reg, register offset, basic, H/Q-form
879 // Store vector reg, register offset, scale, H/Q-form
880 // Store vector reg, register offset, extend, H/Q-form
881 // Store vector reg, register offset, extend, scale, H/Q-form
882 def : InstRW<[V1Write_2c_1I_1L01_1V01, ReadAdrBase],
883 (instregex "^STR[HQ]ro[WX]$")>;
885 // Store vector pair, immed offset, S/D/Q-form
886 def : InstRW<[V1Write_2c_1L01_1V01], (instregex "^STN?P[SDQ]i$")>;
888 // Store vector pair, immed post-index, S/D-form
889 // Store vector pair, immed pre-index, S/D-form
890 def : InstRW<[WriteAdr, V1Write_2c_1L01_1V01],
891 (instregex "^STP[SD](pre|post)$")>;
893 // Store vector pair, immed post-index, Q-form
894 // Store vector pair, immed pre-index, Q-form
895 def : InstRW<[WriteAdr, V1Write_2c_2L01_1V01], (instrs STPQpre, STPQpost)>;
898 // ASIMD integer instructions
899 // -----------------------------------------------------------------------------
901 // ASIMD absolute diff
902 // ASIMD absolute diff long
903 // ASIMD arith, basic
904 // ASIMD arith, complex
905 // ASIMD arith, pair-wise
908 // ASIMD max/min, basic and pair-wise
909 def : SchedAlias<WriteVd, V1Write_2c_1V>;
910 def : SchedAlias<WriteVq, V1Write_2c_1V>;
912 // ASIMD absolute diff accum
913 // ASIMD absolute diff accum long
914 // ASIMD pairwise add and accumulate long
915 def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
917 // ASIMD arith, reduce, 4H/4S
918 // ASIMD max/min, reduce, 4H/4S
919 def : InstRW<[V1Write_2c_1V13], (instregex "^(ADD|[SU]ADDL)Vv4(i16|i32)v$",
920 "^[SU](MAX|MIN)Vv4(i16|i32)v$")>;
922 // ASIMD arith, reduce, 8B/8H
923 // ASIMD max/min, reduce, 8B/8H
924 def : InstRW<[V1Write_4c_1V13_1V], (instregex "^(ADD|[SU]ADDL)Vv8(i8|i16)v$",
925 "^[SU](MAX|MIN)Vv8(i8|i16)v$")>;
927 // ASIMD arith, reduce, 16B
928 // ASIMD max/min, reduce, 16B
929 def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",
930 "^[SU](MAX|MIN)Vv16i8v$")>;
933 // ASIMD dot product using signed and unsigned integers
934 def : InstRW<[V1Wr_VDOT, V1Rd_VDOT],
935 (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
937 // ASIMD matrix multiply-accumulate
938 def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
// ASIMD multiply (MUL, SQDMULH, SQRDMULH)
941 def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
943 // ASIMD multiply accumulate
944 def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>;
946 // ASIMD multiply accumulate long
947 def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
949 // ASIMD multiply accumulate high
950 def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
952 // ASIMD multiply accumulate saturating long
953 def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;
955 // ASIMD multiply/multiply long (8x8) polynomial
956 def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
958 // ASIMD multiply long
959 def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>;
961 // ASIMD shift accumulate
962 def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>;
964 // ASIMD shift by immed, complex
965 // ASIMD shift by register, complex
966 def : InstRW<[V1Write_4c_1V13],
967 (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
968 "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
969 "^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv",
970 "^[SU]Q?RSHLv", "^[SU]QSHLv")>;
972 // ASIMD shift by immed, basic
973 // ASIMD shift by immed and insert, basic
974 // ASIMD shift by register, basic
975 def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv",
976 "^[SU]SHRv", "^S[LR]Iv", "^[SU]SHLv")>;
979 // ASIMD FP instructions
980 // -----------------------------------------------------------------------------
982 // ASIMD FP absolute value/difference
983 // ASIMD FP arith, normal
985 // ASIMD FP max/min, normal
986 // ASIMD FP max/min, pairwise
988 // Covered by "SchedAlias (WriteV[dq]...)" above
990 // ASIMD FP complex add
991 def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>;
993 // ASIMD FP complex multiply add
994 def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>;
// ASIMD FP multiply (FMUL, FMULX)
997 def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>;
999 // ASIMD FP multiply accumulate
1000 def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>;
1002 // ASIMD FP multiply accumulate long
1003 def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>;
1005 // ASIMD FP convert, long (F16 to F32)
1006 def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>;
1008 // ASIMD FP convert, long (F32 to F64)
1009 def : InstRW<[V1Write_3c_1V02], (instregex "^FCVTLv[24]i32$")>;
1011 // ASIMD FP convert, narrow (F32 to F16)
1012 def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTNv[48]i16$")>;
1014 // ASIMD FP convert, narrow (F64 to F32)
1015 def : InstRW<[V1Write_3c_1V02], (instregex "^FCVTNv[24]i32$",
1016 "^FCVTXN(v[24]f32|v1i64)$")>;
1018 // ASIMD FP convert, other, D-form F32 and Q-form F64
1019 def : InstRW<[V1Write_3c_1V02], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$",
1020 "^FCVT[AMNPZ][SU]v2i(32|64)_shift$",
1021 "^FCVT[AMNPZ][SU]v1i64$",
1023 "^[SU]CVTFv2f(32|64)$",
1024 "^[SU]CVTFv2i(32|64)_shift$",
1028 // ASIMD FP convert, other, D-form F16 and Q-form F32
1029 def : InstRW<[V1Write_4c_2V02], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$",
1030 "^FCVT[AMNPZ][SU]v4i(16|32)_shift$",
1031 "^FCVT[AMNPZ][SU]v1i32$",
1033 "^[SU]CVTFv4f(16|32)$",
1034 "^[SU]CVTFv4i(16|32)_shift$",
1038 // ASIMD FP convert, other, Q-form F16
1039 def : InstRW<[V1Write_6c_4V02], (instregex "^FCVT[AMNPZ][SU]v8f16$",
1040 "^FCVT[AMNPZ][SU]v8i16_shift$",
1041 "^FCVT[AMNPZ][SU]v1f16$",
1044 "^[SU]CVTFv8i16_shift$",
1048 // ASIMD FP divide, D-form, F16
1049 // ASIMD FP square root, D-form, F16
1050 def : InstRW<[V1Write_7c7_1V02], (instrs FDIVv4f16, FSQRTv4f16)>;
1052 // ASIMD FP divide, F32
1053 // ASIMD FP square root, F32
1054 def : InstRW<[V1Write_10c7_1V02], (instrs FDIVv2f32, FDIVv4f32,
1055 FSQRTv2f32, FSQRTv4f32)>;
1057 // ASIMD FP divide, Q-form, F16
1058 def : InstRW<[V1Write_13c5_1V02], (instrs FDIVv8f16)>;
1060 // ASIMD FP divide, Q-form, F64
1061 def : InstRW<[V1Write_15c7_1V02], (instrs FDIVv2f64)>;
1063 // ASIMD FP square root, Q-form, F16
1064 def : InstRW<[V1Write_13c11_1V02], (instrs FSQRTv8f16)>;
1066 // ASIMD FP square root, Q-form, F64
1067 def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTv2f64)>;
1069 // ASIMD FP max/min, reduce, F32 and D-form F16
1070 def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
1072 // ASIMD FP max/min, reduce, Q-form F16
1073 def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;
1075 // ASIMD FP round, D-form F32 and Q-form F64
1076 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;
1078 // ASIMD FP round, D-form F16 and Q-form F32
1079 def : InstRW<[V1Write_4c_2V02], (instregex "^FRINT[AIMNPXZ]v4f(16|32)$")>;
1081 // ASIMD FP round, Q-form F16
1082 def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
1085 // ASIMD BF instructions
1086 // -----------------------------------------------------------------------------
1088 // ASIMD convert, F32 to BF16
1089 def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>;
1091 // ASIMD dot product
1092 def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
1094 // ASIMD matrix multiply accumulate
1095 def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>;
1097 // ASIMD multiply accumulate long
1098 def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>;
1100 // Scalar convert, F32 to BF16
1101 def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>;
1104 // ASIMD miscellaneous instructions
1105 // -----------------------------------------------------------------------------
1107 // ASIMD bit reverse
1108 // ASIMD bitwise insert
1110 // ASIMD duplicate, element
1112 // ASIMD extract narrow
1113 // ASIMD insert, element to element
1114 // ASIMD move, FP immed
1115 // ASIMD move, integer immed
1117 // ASIMD table lookup, 1 or 2 table regs
1118 // ASIMD table lookup extension, 1 table reg
1119 // ASIMD transfer, element to gen reg
1122 // Covered by "SchedAlias (WriteV[dq]...)" above
1124 // ASIMD duplicate, gen reg
1125 def : InstRW<[V1Write_3c_1M0],
1126 (instregex "^DUP((v16|v8)i8|(v8|v4)i16|(v4|v2)i32|v2i64)gpr$")>;
1128 // ASIMD extract narrow, saturating
1129 def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
1131 // ASIMD reciprocal and square root estimate, D-form U32
1132 // ASIMD reciprocal and square root estimate, D-form F32 and F64
1133 def : InstRW<[V1Write_3c_1V02], (instrs URECPEv2i32,
1135 FRECPEv1i32, FRECPEv2f32, FRECPEv1i64,
1136 FRSQRTEv1i32, FRSQRTEv2f32, FRSQRTEv1i64)>;
1138 // ASIMD reciprocal and square root estimate, Q-form U32
1139 // ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32 and F64
1140 def : InstRW<[V1Write_4c_1V02], (instrs URECPEv4i32,
1142 FRECPEv1f16, FRECPEv4f16,
1143 FRECPEv4f32, FRECPEv2f64,
1144 FRSQRTEv1f16, FRSQRTEv4f16,
1145 FRSQRTEv4f32, FRSQRTEv2f64)>;
1147 // ASIMD reciprocal and square root estimate, Q-form F16
1148 def : InstRW<[V1Write_6c_2V02], (instrs FRECPEv8f16,
1151 // ASIMD reciprocal exponent
1152 def : InstRW<[V1Write_3c_1V02], (instrs FRECPXv1f16, FRECPXv1i32, FRECPXv1i64)>;
1154 // ASIMD reciprocal step
1155 def : InstRW<[V1Write_4c_1V], (instregex "^FRECPS(16|32|64)$", "^FRECPSv",
1156 "^FRSQRTS(16|32|64)$", "^FRSQRTSv")>;
1158 // ASIMD table lookup, 1 or 2 table regs
1159 // ASIMD table lookup extension, 1 table reg
1160 def : InstRW<[V1Write_2c_2V01], (instregex "^TBLv(8|16)i8(One|Two)$",
1161 "^TBXv(8|16)i8One$")>;
1163 // ASIMD table lookup, 3 table regs
1164 // ASIMD table lookup extension, 2 table regs
1165 def : InstRW<[V1Write_4c_2V01], (instrs TBLv8i8Three, TBLv16i8Three,
1166 TBXv8i8Two, TBXv16i8Two)>;
1168 // ASIMD table lookup, 4 table regs
1169 def : InstRW<[V1Write_4c_3V01], (instrs TBLv8i8Four, TBLv16i8Four)>;
1171 // ASIMD table lookup extension, 3 table regs
1172 def : InstRW<[V1Write_6c_3V01], (instrs TBXv8i8Three, TBXv16i8Three)>;
1174 // ASIMD table lookup extension, 4 table regs
1175 def : InstRW<[V1Write_6c_5V01], (instrs TBXv8i8Four, TBXv16i8Four)>;
1177 // ASIMD transfer, element to gen reg
1178 def : InstRW<[V1Write_2c_1V], (instregex "^SMOVvi(((8|16)to(32|64))|32to64)$",
1179 "^UMOVvi(8|16|32|64)$")>;
1181 // ASIMD transfer, gen reg to element
1182 def : InstRW<[V1Write_5c_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
1185 // ASIMD load instructions
1186 // -----------------------------------------------------------------------------
1188 // ASIMD load, 1 element, multiple, 1 reg
1189 def : InstRW<[V1Write_6c_1L],
1190 (instregex "^LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
1191 def : InstRW<[WriteAdr, V1Write_6c_1L],
1192 (instregex "^LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
1194 // ASIMD load, 1 element, multiple, 2 reg
1195 def : InstRW<[V1Write_6c_2L],
1196 (instregex "^LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
1197 def : InstRW<[WriteAdr, V1Write_6c_2L],
1198 (instregex "^LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
1200 // ASIMD load, 1 element, multiple, 3 reg
1201 def : InstRW<[V1Write_6c_3L],
1202 (instregex "^LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
1203 def : InstRW<[WriteAdr, V1Write_6c_3L],
1204 (instregex "^LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
1206 // ASIMD load, 1 element, multiple, 4 reg, D-form
1207 def : InstRW<[V1Write_6c_2L],
1208 (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
1209 def : InstRW<[WriteAdr, V1Write_6c_2L],
1210 (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
1212 // ASIMD load, 1 element, multiple, 4 reg, Q-form
1213 def : InstRW<[V1Write_7c_4L],
1214 (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
1215 def : InstRW<[WriteAdr, V1Write_7c_4L],
1216 (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
1218 // ASIMD load, 1 element, one lane
1219 // ASIMD load, 1 element, all lanes
1220 def : InstRW<[V1Write_8c_1L_1V],
1221 (instregex "^LD1(i|Rv)(8|16|32|64)$",
1222 "^LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
1223 def : InstRW<[WriteAdr, V1Write_8c_1L_1V],
1224 (instregex "^LD1i(8|16|32|64)_POST$",
1225 "^LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
1227 // ASIMD load, 2 element, multiple, D-form
1228 def : InstRW<[V1Write_8c_1L_2V],
1229 (instregex "^LD2Twov(8b|4h|2s)$")>;
1230 def : InstRW<[WriteAdr, V1Write_8c_1L_2V],
1231 (instregex "^LD2Twov(8b|4h|2s)_POST$")>;
1233 // ASIMD load, 2 element, multiple, Q-form
1234 def : InstRW<[V1Write_8c_2L_2V],
1235 (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
1236 def : InstRW<[WriteAdr, V1Write_8c_2L_2V],
1237 (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
1239 // ASIMD load, 2 element, one lane
1240 // ASIMD load, 2 element, all lanes
1241 def : InstRW<[V1Write_8c_1L_2V],
1242 (instregex "^LD2i(8|16|32|64)$",
1243 "^LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
1244 def : InstRW<[WriteAdr, V1Write_8c_1L_2V],
1245 (instregex "^LD2i(8|16|32|64)_POST$",
1246 "^LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
1248 // ASIMD load, 3 element, multiple, D-form
1249 // ASIMD load, 3 element, one lane
1250 // ASIMD load, 3 element, all lanes
1251 def : InstRW<[V1Write_8c_2L_3V],
1252 (instregex "^LD3Threev(8b|4h|2s)$",
1253 "^LD3i(8|16|32|64)$",
1254 "^LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
1255 def : InstRW<[WriteAdr, V1Write_8c_2L_3V],
1256 (instregex "^LD3Threev(8b|4h|2s)_POST$",
1257 "^LD3i(8|16|32|64)_POST$",
1258 "^LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
1260 // ASIMD load, 3 element, multiple, Q-form
1261 def : InstRW<[V1Write_8c_3L_3V],
1262 (instregex "^LD3Threev(16b|8h|4s|2d)$")>;
1263 def : InstRW<[WriteAdr, V1Write_8c_3L_3V],
1264 (instregex "^LD3Threev(16b|8h|4s|2d)_POST$")>;
1266 // ASIMD load, 4 element, multiple, D-form
1267 // ASIMD load, 4 element, one lane
1268 // ASIMD load, 4 element, all lanes
1269 def : InstRW<[V1Write_8c_3L_4V],
1270 (instregex "^LD4Fourv(8b|4h|2s)$",
1271 "^LD4i(8|16|32|64)$",
1272 "^LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
1273 def : InstRW<[WriteAdr, V1Write_8c_3L_4V],
1274 (instregex "^LD4Fourv(8b|4h|2s)_POST$",
1275 "^LD4i(8|16|32|64)_POST$",
1276 "^LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
1278 // ASIMD load, 4 element, multiple, Q-form
1279 def : InstRW<[V1Write_9c_4L_4V],
1280 (instregex "^LD4Fourv(16b|8h|4s|2d)$")>;
1281 def : InstRW<[WriteAdr, V1Write_9c_4L_4V],
1282 (instregex "^LD4Fourv(16b|8h|4s|2d)_POST$")>;
1285 // ASIMD store instructions
1286 // -----------------------------------------------------------------------------
1288 // ASIMD store, 1 element, multiple, 1 reg
1289 // ASIMD store, 1 element, multiple, 2 reg, D-form
1290 def : InstRW<[V1Write_2c_1L01_1V01],
1291 (instregex "^ST1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$",
1292 "^ST1Twov(8b|4h|2s|1d)$")>;
1293 def : InstRW<[WriteAdr, V1Write_2c_1L01_1V01],
1294 (instregex "^ST1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$",
1295 "^ST1Twov(8b|4h|2s|1d)_POST$")>;
1297 // ASIMD store, 1 element, multiple, 2 reg, Q-form
1298 // ASIMD store, 1 element, multiple, 3 reg, D-form
1299 // ASIMD store, 1 element, multiple, 4 reg, D-form
1300 def : InstRW<[V1Write_2c_2L01_2V01],
1301 (instregex "^ST1Twov(16b|8h|4s|2d)$",
1302 "^ST1Threev(8b|4h|2s|1d)$",
1303 "^ST1Fourv(8b|4h|2s|1d)$")>;
1304 def : InstRW<[WriteAdr, V1Write_2c_2L01_2V01],
1305 (instregex "^ST1Twov(16b|8h|4s|2d)_POST$",
1306 "^ST1Threev(8b|4h|2s|1d)_POST$",
1307 "^ST1Fourv(8b|4h|2s|1d)_POST$")>;
1309 // ASIMD store, 1 element, multiple, 3 reg, Q-form
1310 def : InstRW<[V1Write_2c_3L01_3V01],
1311 (instregex "^ST1Threev(16b|8h|4s|2d)$")>;
1312 def : InstRW<[WriteAdr, V1Write_2c_3L01_3V01],
1313 (instregex "^ST1Threev(16b|8h|4s|2d)_POST$")>;
1315 // ASIMD store, 1 element, multiple, 4 reg, Q-form
1316 def : InstRW<[V1Write_2c_4L01_4V01],
1317 (instregex "^ST1Fourv(16b|8h|4s|2d)$")>;
1318 def : InstRW<[WriteAdr, V1Write_2c_4L01_4V01],
1319 (instregex "^ST1Fourv(16b|8h|4s|2d)_POST$")>;
1321 // ASIMD store, 1 element, one lane
1322 // ASIMD store, 2 element, multiple, D-form
1323 // ASIMD store, 2 element, one lane
1324 def : InstRW<[V1Write_4c_1L01_1V01],
1325 (instregex "^ST1i(8|16|32|64)$",
1326 "^ST2Twov(8b|4h|2s)$",
1327 "^ST2i(8|16|32|64)$")>;
1328 def : InstRW<[WriteAdr, V1Write_4c_1L01_1V01],
1329 (instregex "^ST1i(8|16|32|64)_POST$",
1330 "^ST2Twov(8b|4h|2s)_POST$",
1331 "^ST2i(8|16|32|64)_POST$")>;
1333 // ASIMD store, 2 element, multiple, Q-form
1334 // ASIMD store, 3 element, multiple, D-form
1335 // ASIMD store, 3 element, one lane
1336 // ASIMD store, 4 element, one lane, D
1337 def : InstRW<[V1Write_4c_2L01_2V01],
1338 (instregex "^ST2Twov(16b|8h|4s|2d)$",
1339 "^ST3Threev(8b|4h|2s)$",
1340 "^ST3i(8|16|32|64)$",
1342 def : InstRW<[WriteAdr, V1Write_4c_2L01_2V01],
1343 (instregex "^ST2Twov(16b|8h|4s|2d)_POST$",
1344 "^ST3Threev(8b|4h|2s)_POST$",
1345 "^ST3i(8|16|32|64)_POST$",
1348 // ASIMD store, 3 element, multiple, Q-form
1349 def : InstRW<[V1Write_5c_3L01_3V01],
1350 (instregex "^ST3Threev(16b|8h|4s|2d)$")>;
1351 def : InstRW<[WriteAdr, V1Write_5c_3L01_3V01],
1352 (instregex "^ST3Threev(16b|8h|4s|2d)_POST$")>;
1354 // ASIMD store, 4 element, multiple, D-form
1355 def : InstRW<[V1Write_6c_3L01_3V01],
1356 (instregex "^ST4Fourv(8b|4h|2s)$")>;
1357 def : InstRW<[WriteAdr, V1Write_6c_3L01_3V01],
1358 (instregex "^ST4Fourv(8b|4h|2s)_POST$")>;
1360 // ASIMD store, 4 element, multiple, Q-form, B/H/S
1361 def : InstRW<[V1Write_7c_6L01_6V01],
1362 (instregex "^ST4Fourv(16b|8h|4s)$")>;
1363 def : InstRW<[WriteAdr, V1Write_7c_6L01_6V01],
1364 (instregex "^ST4Fourv(16b|8h|4s)_POST$")>;
1366 // ASIMD store, 4 element, multiple, Q-form, D
1367 def : InstRW<[V1Write_4c_4L01_4V01],
1368 (instrs ST4Fourv2d)>;
1369 def : InstRW<[WriteAdr, V1Write_4c_4L01_4V01],
1370 (instrs ST4Fourv2d_POST)>;
1372 // ASIMD store, 4 element, one lane, B/H/S
// Stores elsewhere in this section all use L01/V01 write classes; the plain
// `L` resource classes are otherwise used only by loads, so this entry uses
// the 6-cycle 3xL01 + 3xV01 class as well.
// NOTE(review): confirm the pipelines against the Software Optimization Guide.
1373 def : InstRW<[V1Write_6c_3L01_3V01],
1374 (instregex "^ST4i(8|16|32)$")>;
1375 def : InstRW<[WriteAdr, V1Write_6c_3L01_3V01],
1376 (instregex "^ST4i(8|16|32)_POST$")>;
1379 // Cryptography extensions
1380 // -----------------------------------------------------------------------------
1382 // Crypto polynomial (64x64) multiply long
1383 // Covered by "SchedAlias (WriteV[dq]...)" above
// Crypto AES ops
// V1WriteVC wraps V1Write_2c_1V in a WriteSequence so that V1ReadVC can name
// it as a producer: a read tagged V1ReadVC is advanced by 2 cycles when its
// operand was written by V1WriteVC (i.e. by AESD/AESE below).
1386 def V1WriteVC : WriteSequence<[V1Write_2c_1V]>;
1387 def V1ReadVC : SchedReadAdvance<2, [V1WriteVC]>;
// AESD/AESE: producers tagged with V1WriteVC
1388 def : InstRW<[V1WriteVC], (instrs AESDrr, AESErr)>;
// AESMC/AESIMC: consumers whose source read uses the V1ReadVC advance
1389 def : InstRW<[V1Write_2c_1V, V1ReadVC], (instrs AESMCrr, AESIMCrr)>;
1391 // Crypto SHA1 hash acceleration op
1392 // Crypto SHA1 schedule acceleration ops
1393 // Crypto SHA256 schedule acceleration ops
1394 // Crypto SHA512 hash acceleration ops
1396 def : InstRW<[V1Write_2c_1V0], (instregex "^SHA1(H|SU[01])rr$",
1398 "^SHA512(H2?|SU[01])$",
1399 "^SM3(PARTW(1|2SM3SS1)|TT[12][AB])$")>;
1401 // Crypto SHA1 hash acceleration ops
1402 // Crypto SHA256 hash acceleration ops
1404 def : InstRW<[V1Write_4c_1V0], (instregex "^SHA1[CMP]rrr$",
// Crypto SHA3 ops (BCAX, EOR3, RAX1, XAR)
1409 def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
1413 // -----------------------------------------------------------------------------
// CRC checksum ops (CRC32 and CRC32C; B, H, W and X forms)
1416 def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;
1419 // SVE Predicate instructions
1420 // -----------------------------------------------------------------------------
1422 // Loop control, based on predicate
1423 def : InstRW<[V1Write_2c_1M0], (instregex "^BRK[AB]_PP[mz]P$")>;
1424 def : InstRW<[V1Write_2c_1M0], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>;
1426 // Loop control, based on predicate and flag setting
1427 def : InstRW<[V1Write_3c_2M0], (instrs BRKAS_PPzP, BRKBS_PPzP, BRKNS_PPzP,
1428 BRKPAS_PPzPP, BRKPBS_PPzPP)>;
1430 // Loop control, based on GPR
1431 def : InstRW<[V1Write_3c_2M0], (instregex "^WHILE(LE|LO|LS|LT)_P(WW|XX)_[BHSD]$")>;
// Loop terminate (CTERMEQ, CTERMNE)
1434 def : InstRW<[V1Write_1c_1M0], (instregex "^CTERM(EQ|NE)_(WW|XX)$")>;
1436 // Predicate counting scalar
1437 // Predicate counting scalar, active predicate
1438 def : InstRW<[V1Write_2c_1M0], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
1439 def : InstRW<[V1Write_2c_1M0], (instregex "^(CNT|([SU]Q)?(DEC|INC))[BHWD]_XPiI$",
1440 "^SQ(DEC|INC)[BHWD]_XPiWdI$",
1441 "^UQ(DEC|INC)[BHWD]_WPiI$",
1442 "^CNTP_XPP_[BHSD]$",
1443 "^([SU]Q)?(DEC|INC)P_XP_[BHSD]$",
1444 "^UQ(DEC|INC)P_WP_[BHSD]$",
1445 "^[SU]Q(DEC|INC)P_XPWd_[BHSD]$")>;
1447 // Predicate counting vector, active predicate
1448 def : InstRW<[V1Write_7c_2M0_1V01], (instregex "^([SU]Q)?(DEC|INC)P_ZP_[HSD]$")>;
1450 // Predicate logical
1451 def : InstRW<[V1Write_1c_1M0],
1452 (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP$")>;
1454 // Predicate logical, flag setting
1455 def : InstRW<[V1Write_2c_2M0],
1456 (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)S_PPzPP$")>;
1458 // Predicate reverse
1459 // Predicate set/initialize/find next
1460 // Predicate transpose
1461 // Predicate unpack and widen
1462 // Predicate zip/unzip
1463 def : InstRW<[V1Write_2c_1M0], (instregex "^REV_PP_[BHSD]$",
1464 "^PFALSE$", "^PFIRST_B$",
1465 "^PNEXT_[BHSD]$", "^PTRUE_[BHSD]$",
1466 "^TRN[12]_PPP_[BHSDQ]$",
1467 "^(ZIP|UZP)[12]_PPP_[BHSDQ]$")>;
1469 // Predicate test
1470 // Predicate unpack and widen
1471 def : InstRW<[V1Write_2c_1M0], (instrs PTEST_PP,
1472 PUNPKHI_PP, PUNPKLO_PP)>;
// Predicate select
1475 def : InstRW<[V1Write_1c_1M0], (instrs SEL_PPPP)>;
1477 // Predicate set/initialize, set flags
1478 def : InstRW<[V1Write_3c_2M0], (instregex "^PTRUES_[BHSD]$")>;
1482 // SVE integer instructions
1483 // -----------------------------------------------------------------------------
1485 // Arithmetic, basic, and logical (AND/BIC/EOR/NOT/ORR forms)
// NOTE(review): in the `_ZP?ZZ` pattern the plain `EOR` alternative is already
// covered by `EOR(BT|TB)?`, and the separate `^EOR(BT|TB)_ZZZ_[BHSD]$` pattern
// looks subsumed by it as well; confirm before simplifying.
1487 def : InstRW<[V1Write_2c_1V01],
1488 (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]",
1489 "^(ADD|SUB)_Z(I|P[mZ]Z|ZZ)_[BHSD]",
1490 "^ADR_[SU]XTW_ZZZ_D_[0123]$",
1491 "^ADR_LSL_ZZZ_[SD]_[0123]$",
1492 "^[SU]ABD_ZP[mZ]Z_[BHSD]",
1493 "^[SU](MAX|MIN)_Z(I|P[mZ]Z)_[BHSD]",
1494 "^[SU]Q(ADD|SUB)_Z(I|ZZ)_[BHSD]$",
1495 "^SUBR_Z(I|P[mZ]Z)_[BHSD]",
1496 "^(AND|EOR|ORR)_ZI$",
1497 "^(AND|BIC|EOR|EOR(BT|TB)?|ORR)_ZP?ZZ",
1498 "^EOR(BT|TB)_ZZZ_[BHSD]$",
1499 "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]")>;
1501 // Arithmetic, shift
1502 def : InstRW<[V1Write_2c_1V1],
1503 (instregex "^(ASR|LSL|LSR)_WIDE_Z(Pm|Z)Z_[BHS]",
1504 "^(ASR|LSL|LSR)_ZPm[IZ]_[BHSD]",
1505 "^(ASR|LSL|LSR)_ZZI_[BHSD]",
1506 "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]",
1507 "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
1509 // Arithmetic, shift right for divide
1510 def : InstRW<[V1Write_4c_1V1], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>;
1512 // Count/reverse bits
1513 def : InstRW<[V1Write_2c_1V01], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>;
1515 // Broadcast logical bitmask immediate to vector
1516 def : InstRW<[V1Write_2c_1V01], (instrs DUPM_ZI)>;
1518 // Compare and set flags
1519 def : InstRW<[V1Write_4c_1M0_1V0],
1520 (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]$",
1521 "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]$")>;
1523 // Conditional extract operations, scalar form
1524 def : InstRW<[V1Write_9c_1M0_1V1], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>;
1526 // Conditional extract operations, SIMD&FP scalar and vector forms
1527 def : InstRW<[V1Write_3c_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]$",
1528 "^COMPACT_ZPZ_[SD]$",
1529 "^SPLICE_ZPZZ?_[BHSD]$")>;
1531 // Convert to floating point, 64b to float or convert to double
1532 def : InstRW<[V1Write_3c_1V0], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]",
1533 "^[SU]CVTF_ZPmZ_StoD")>;
1535 // Convert to floating point, 32b to single or half
1536 def : InstRW<[V1Write_4c_2V0], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
1538 // Convert to floating point, 16b to half
1539 def : InstRW<[V1Write_6c_4V0], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
// Copy, scalar
1542 def : InstRW<[V1Write_5c_1M0_1V01], (instregex "^CPY_ZPmR_[BHSD]$")>;
1544 // Copy, scalar SIMD&FP or imm
1545 def : InstRW<[V1Write_2c_1V01], (instregex "^CPY_ZP([mz]I|mV)_[BHSD]$")>;
// Divide, S element size
1548 def : InstRW<[V1Write_12c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_S",
1549 "^[SU]DIV_ZPZZ_S")>;
// Divide, D element size
1552 def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
1553 "^[SU]DIV_ZPZZ_D")>;
1555 // Dot product, 8 bit
1556 def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>;
1558 // Dot product, 8 bit, using signed and unsigned integers
1559 def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB],
1560 (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
1562 // Dot product, 16 bit
1563 def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>;
1565 // Duplicate, immediate and indexed form
1566 def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$",
1567 "^DUP_ZZI_[BHSDQ]$")>;
1569 // Duplicate, scalar form
1570 def : InstRW<[V1Write_3c_1M0], (instregex "^DUP_ZR_[BHSD]$")>;
1572 // Extend, sign or zero
1573 def : InstRW<[V1Write_2c_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]",
1574 "^[SU]XTH_ZPmZ_[SD]",
1575 "^[SU]XTW_ZPmZ_D")>;
// Extract
1578 def : InstRW<[V1Write_2c_1V01], (instrs EXT_ZZI)>;
1580 // Extract/insert operation, SIMD and FP scalar form
1581 def : InstRW<[V1Write_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]$",
1582 "^INSR_ZV_[BHSD]$")>;
1584 // Extract/insert operation, scalar
1585 def : InstRW<[V1Write_6c_1M0_1V1], (instregex "^LAST[AB]_RPZ_[BHSD]$",
1586 "^INSR_ZR_[BHSD]$")>;
1588 // Horizontal operations, B, H, S form, imm, imm
1589 def : InstRW<[V1Write_4c_1V0], (instregex "^INDEX_II_[BHS]$")>;
1591 // Horizontal operations, B, H, S form, scalar, imm / scalar / imm, scalar
1592 def : InstRW<[V1Write_7c_1M0_1V0], (instregex "^INDEX_(IR|RI|RR)_[BHS]$")>;
1594 // Horizontal operations, D form, imm, imm
1595 def : InstRW<[V1Write_5c_2V0], (instrs INDEX_II_D)>;
1597 // Horizontal operations, D form, scalar, imm / scalar / imm, scalar
1598 def : InstRW<[V1Write_8c_2M0_2V0], (instregex "^INDEX_(IR|RI|RR)_D$")>;
1601 def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
1604 // Matrix multiply-accumulate
1605 def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
1607 // Multiply, B, H, S element size
1608 def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
1610 "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]",
1611 "^[SU]MULH_ZPZZ_[BHS]")>;
1613 // Multiply, D element size
1614 def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
1616 "^[SU]MULH_(ZPmZ|ZZZ)_D",
1617 "^[SU]MULH_ZPZZ_D")>;
1619 // Multiply accumulate, D element size
1620 def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD],
1621 (instregex "^ML[AS]_ZPZZZ_D")>;
1622 def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD],
1623 (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
1625 // Multiply accumulate, B, H, S element size
1626 // NOTE: This is not specified in the Software Optimization Guide (SOG).
1627 def : InstRW<[V1Write_4c_1V0], (instregex "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
1629 // Predicate counting vector
1630 def : InstRW<[V1Write_2c_1V0], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI$")>;
1632 // Reduction, arithmetic, B form
1633 def : InstRW<[V1Write_14c_1V_1V0_2V1_1V13],
1634 (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
1636 // Reduction, arithmetic, H form
1637 def : InstRW<[V1Write_12c_1V_1V01_2V1],
1638 (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
1640 // Reduction, arithmetic, S form
1641 def : InstRW<[V1Write_10c_1V_1V01_2V1],
1642 (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
1644 // Reduction, arithmetic, D form
1645 def : InstRW<[V1Write_8c_1V_1V01],
1646 (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
1648 // Reduction, logical
1649 def : InstRW<[V1Write_12c_4V01], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]$")>;
1652 def : InstRW<[V1Write_2c_1V01], (instregex "^REV_ZZ_[BHSD]$",
1653 "^REVB_ZPmZ_[HSD]$",
1657 // Select, vector form
1659 // Table lookup extension
1660 // Transpose, vector form
1661 // Unpack and extend
1663 def : InstRW<[V1Write_2c_1V01], (instregex "^SEL_ZPZZ_[BHSD]$",
1664 "^TB[LX]_ZZZ_[BHSD]$",
1665 "^TRN[12]_ZZZ_[BHSDQ]$",
1666 "^[SU]UNPK(HI|LO)_ZZ_[HSD]$",
1667 "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]$")>;
1670 // SVE floating-point instructions
1671 // -----------------------------------------------------------------------------
1673 // Floating point absolute value/difference
1674 def : InstRW<[V1Write_2c_1V01], (instregex "^FAB[SD]_ZPmZ_[HSD]",
1676 "^FABS_ZPmZ_[HSD]")>;
1678 // Floating point arithmetic
1679 def : InstRW<[V1Write_2c_1V01], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]",
1680 "^F(ADD|SUB)_ZPZ[IZ]_[HSD]",
1681 "^FADDP_ZPmZZ_[HSD]",
1683 "^FSUBR_ZPm[IZ]_[HSD]",
1684 "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>;
1686 // Floating point associative add, F16
1687 def : InstRW<[V1Write_19c_18V0], (instrs FADDA_VPZ_H)>;
1689 // Floating point associative add, F32
1690 def : InstRW<[V1Write_11c_10V0], (instrs FADDA_VPZ_S)>;
1692 // Floating point associative add, F64
1693 def : InstRW<[V1Write_8c_3V01], (instrs FADDA_VPZ_D)>;
1695 // Floating point compare
// Covers FACGE/FACGT and FCMxx against a vector (PPzZZ) as well as FCMxx
// against zero (PPzZ0); all three patterns are anchored at both ends.
1696 def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
1697 "^FCM(EQ|GE|GT|NE|UO)_PPzZZ_[HSD]$",
1698 "^FCM(EQ|GE|GT|LE|LT|NE)_PPzZ0_[HSD]$")>;
1700 // Floating point complex add
1701 def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>;
1703 // Floating point complex multiply add
// The ZPmZZ record lists three entries (write, ReadDefault, V1Rd_ZFCMA); the
// ZZZI record lists two and only matches the H and S element sizes.
// NOTE(review): the extra ReadDefault presumably pads the predicate operand so
// that V1Rd_ZFCMA lands on the accumulator - confirm against operand order.
1704 def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
1705 def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
// FCVT / FCVTZ[SU] records are split by the XtoY suffix of the opcode name.
// None of the conversion patterns ends in `$`.
1707 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
1708 // Floating point convert to integer, F32
// FCVTZS/FCVTZU: the HtoS and StoS suffixes are grouped here.
1709 def : InstRW<[V1Write_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
1710 "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
1712 // Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 or F64 to F16)
1713 // Floating point convert to integer, F64
// FCVTZS/FCVTZU: the HtoD, StoD, DtoS and DtoD suffixes are grouped here.
1714 def : InstRW<[V1Write_3c_1V0], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
1715 "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
1717 // Floating point convert to integer, F16
1718 def : InstRW<[V1Write_6c_4V0], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
1720 // Floating point copy (FCPY with immediate, FDUP with immediate)
1721 def : InstRW<[V1Write_2c_1V01], (instregex "^FCPY_ZPmI_[HSD]$",
1722 "^FDUP_ZI_[HSD]$")>;
// "FDIVR?" matches both FDIV and FDIVR, each in its ZPmZ and ZPZZ form.
// NOTE(review): write-class names carry two numbers here (e.g. 13c10);
// presumably latency plus a resource-occupancy figure - confirm against the
// write-class definitions.
1724 // Floating point divide, F16
1725 def : InstRW<[V1Write_13c10_1V0], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
1727 // Floating point divide, F32
1728 def : InstRW<[V1Write_10c7_1V0], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
1730 // Floating point divide, F64
1731 def : InstRW<[V1Write_15c7_1V0], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
1733 // Floating point min/max
// "(NM)?" makes each pattern cover FMAX/FMIN and FMAXNM/FMINNM.
1734 def : InstRW<[V1Write_2c_1V01], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]",
1735 "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>;
1737 // Floating point multiply (FMUL, FMULX, FSCALE)
// "ZZZI?" covers both the ZZZ and ZZZI suffixes of FMUL.
1738 def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
1739 "^FMULX_ZPZZ_[HSD]",
1740 "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]",
1741 "^FMUL_ZPZ[IZ]_[HSD]")>;
1743 // Floating point multiply accumulate
// First record: FMLA/FMLS/FNMLA/FNMLS and FMAD/FMSB/FNMAD/FNMSB in the ZPmZZ
// form, with a ReadDefault entry ahead of V1Rd_ZFMA.
// NOTE(review): ReadDefault presumably covers the predicate operand so that
// V1Rd_ZFMA applies to the next source operand - confirm against operand order.
1744 def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA],
1745 (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
1746 "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
// Second record: FMLA/FMLS in the ZZZI form and FN?ML[AS] in the ZPZZZ form,
// without the ReadDefault entry.
1747 def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA],
1748 (instregex "^FML[AS]_ZZZI_[HSD]",
1749 "^FN?ML[AS]_ZPZZZ_[HSD]")>;
1751 // Floating point reciprocal step (FRECPS, FRSQRTS)
1752 def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
// The estimate records below name their opcodes exactly via `instrs`; each
// pairs FRECPE with FRSQRTE for one element size.
1754 // Floating point reciprocal estimate, F16
1755 def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>;
1757 // Floating point reciprocal estimate, F32
1758 def : InstRW<[V1Write_4c_2V0], (instrs FRECPE_ZZ_S, FRSQRTE_ZZ_S)>;
1760 // Floating point reciprocal estimate, F64
1761 def : InstRW<[V1Write_3c_1V0], (instrs FRECPE_ZZ_D, FRSQRTE_ZZ_D)>;
1763 // Floating point reciprocal exponent (FRECPX, all of H, S and D)
1764 def : InstRW<[V1Write_3c_1V0], (instregex "^FRECPX_ZPmZ_[HSD]")>;
// Each pattern covers FADDV, FMAXV, FMINV, FMAXNMV and FMINNMV for one element
// size and is anchored at both ends.
1766 // Floating point reduction, F16
1767 def : InstRW<[V1Write_13c_6V01], (instregex "^F(ADD|((MAX|MIN)(NM)?))V_VPZ_H$")>;
1769 // Floating point reduction, F32
1770 def : InstRW<[V1Write_11c_1V_5V01], (instregex "^F(ADD|((MAX|MIN)(NM)?))V_VPZ_S$")>;
1772 // Floating point reduction, F64
1773 def : InstRW<[V1Write_9c_1V_4V01], (instregex "^F(ADD|((MAX|MIN)(NM)?))V_VPZ_D$")>;
// "FRINT[AIMNPXZ]" covers FRINTA, FRINTI, FRINTM, FRINTN, FRINTP, FRINTX and
// FRINTZ; one record per element size.
1775 // Floating point round to integral, F16
1776 def : InstRW<[V1Write_6c_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
1778 // Floating point round to integral, F32
1779 def : InstRW<[V1Write_4c_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
1781 // Floating point round to integral, F64
1782 def : InstRW<[V1Write_3c_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
// FSQRT uses the two-number write-class names (e.g. 13c10).
// NOTE(review): presumably latency plus a resource-occupancy figure - confirm
// against the write-class definitions.
1784 // Floating point square root, F16
1785 def : InstRW<[V1Write_13c10_1V0], (instregex "^FSQRT_ZPmZ_H")>;
1787 // Floating point square root, F32
1788 def : InstRW<[V1Write_10c7_1V0], (instregex "^FSQRT_ZPmZ_S")>;
1790 // Floating point square root, F64
1791 def : InstRW<[V1Write_16c7_1V0], (instregex "^FSQRT_ZPmZ_D")>;
1793 // Floating point trigonometric
// Also covers FEXPA alongside FTMAD, FTSMUL and FTSSEL; all patterns are
// anchored at both ends.
1794 def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$",
1795 "^FTMAD_ZZI_[HSD]$",
1796 "^FTS(MUL|SEL)_ZZZ_[HSD]$")>;
1799 // SVE BFloat16 (BF16) instructions
1800 // -----------------------------------------------------------------------------
1802 // Convert, F32 to BF16 (BFCVT and BFCVTNT)
1803 def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
// BFDOT, both the _ZZI and _ZZZ opcodes. This and the two records below pair
// a V1Wr_* write with a matching V1Rd_* read.
// NOTE(review): the paired read presumably models accumulator forwarding -
// confirm against the V1Rd_* definitions.
1806 def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
1808 // Matrix multiply accumulate
1809 def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>;
1811 // Multiply accumulate long (BFMLALB/BFMLALT, ZZZ and ZZZI opcodes)
1812 def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
1815 // SVE Load instructions
1816 // -----------------------------------------------------------------------------
// LDR_ZXI: presumably the vector-register form of LDR - TODO confirm.
1819 def : InstRW<[V1Write_6c_1L01], (instrs LDR_ZXI)>;
// LDR_PXI: presumably the predicate-register form of LDR - TODO confirm.
// Its write class names both an L and an M resource, unlike LDR_ZXI.
1822 def : InstRW<[V1Write_6c_1L_1M], (instrs LDR_PXI)>;
1824 // Contiguous load, scalar + imm
1825 // Contiguous load, scalar + scalar
1826 // Contiguous load broadcast, scalar + imm
1827 // Contiguous load broadcast, scalar + scalar
1828 def : InstRW<[V1Write_6c_1L01], (instregex "^LD1[BHWD]_IMM$",
1829 "^LD1S?B_[HSD]_IMM$",
1830 "^LD1S?H_[SD]_IMM$",
1837 "^LD1RS?B_[HSD]_IMM$",
1838 "^LD1RS?H_[SD]_IMM$",
1840 "^LD1RQ_[BHWD]_IMM$",
1842 def : InstRW<[V1Write_7c_1L01_1S], (instregex "^LD1H$",
1846 // Non temporal load, scalar + imm (LDNT1B/H/W/D, _ZRI opcodes)
1847 def : InstRW<[V1Write_6c_1L01], (instregex "^LDNT1[BHWD]_ZRI$")>;
1849 // Non temporal load, scalar + scalar
// The halfword opcode gets its own write class (7c) while B, W and D share
// the 6c one.
1850 def : InstRW<[V1Write_7c_1L01_1S], (instrs LDNT1H_ZRR)>;
1851 def : InstRW<[V1Write_6c_1L01_1S], (instregex "^LDNT1[BWD]_ZRR$")>;
1853 // Contiguous first faulting load, scalar + scalar
// Halfword opcodes: LDFF1H itself plus LDFF1H/LDFF1SH with an _S or _D suffix.
1854 def : InstRW<[V1Write_7c_1L01_1S], (instregex "^LDFF1H$",
1855 "^LDFF1S?H_[SD]$")>;
1856 def : InstRW<[V1Write_6c_1L01_1S], (instregex "^LDFF1[BWD]$",
1860 // Contiguous non faulting load, scalar + imm
// "S?" makes the last three patterns cover both the LDNF1x and LDNF1Sx opcode
// names; all patterns are anchored at both ends.
1861 def : InstRW<[V1Write_6c_1L01], (instregex "^LDNF1[BHWD]_IMM$",
1862 "^LDNF1S?B_[HSD]_IMM$",
1863 "^LDNF1S?H_[SD]_IMM$",
1864 "^LDNF1S?W_D_IMM$")>;
// Structure loads. The _IMM opcodes are the "scalar + imm" rows; the opcodes
// without a suffix are the "scalar + scalar" rows.
1866 // Contiguous load two structures to two vectors, scalar + imm
1867 def : InstRW<[V1Write_8c_2L01_2V01], (instregex "^LD2[BHWD]_IMM$")>;
1869 // Contiguous load two structures to two vectors, scalar + scalar
// LD2H has its own write class (10c); LD2B/LD2W/LD2D share the 9c one.
1870 def : InstRW<[V1Write_10c_2L01_2V01], (instrs LD2H)>;
1871 def : InstRW<[V1Write_9c_2L01_2V01], (instregex "^LD2[BWD]$")>;
1873 // Contiguous load three structures to three vectors, scalar + imm
1874 def : InstRW<[V1Write_11c_3L01_3V01], (instregex "^LD3[BHWD]_IMM$")>;
1876 // Contiguous load three structures to three vectors, scalar + scalar
1877 def : InstRW<[V1Write_13c_3L01_1S_3V01], (instregex "^LD3[BHWD]$")>;
1879 // Contiguous load four structures to four vectors, scalar + imm
1880 def : InstRW<[V1Write_12c_4L01_4V01], (instregex "^LD4[BHWD]_IMM$")>;
1882 // Contiguous load four structures to four vectors, scalar + scalar
1883 def : InstRW<[V1Write_13c_4L01_2S_4V01], (instregex "^LD4[BHWD]$")>;
// Gather loads. "(FF)?" makes every pattern cover both the GLD1* and the
// GLDFF1* opcode names.
1885 // Gather load, vector + imm, 32-bit element size
1886 def : InstRW<[V1Write_11c_1L_1V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
1887 "^GLD(FF)?1W_IMM$")>;
1889 // Gather load, vector + imm, 64-bit element size
// Besides the _D_IMM opcodes, the patterns below also match the _D and GLD1D
// opcodes with optional _[SU]XTW and _SCALED suffixes.
1890 def : InstRW<[V1Write_9c_2L_2V],
1891 (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
1892 "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?(_SCALED)?$",
1894 "^GLD(FF)?1D(_[SU]XTW)?(_SCALED)?$")>;
1896 // Gather load, 32-bit scaled offset
// NOTE(review): the second pattern has no trailing `$`, unlike the other
// gather patterns - confirm whether that is intentional.
1897 def : InstRW<[V1Write_11c_2L_2V],
1898 (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$",
1899 "^GLD(FF)?1W_[SU]XTW_SCALED")>;
1901 // Gather load, 32-bit unpacked unscaled offset
1902 def : InstRW<[V1Write_9c_1L_1V],
1903 (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
1904 "^GLD(FF)?1W_[SU]XTW$")>;
1907 // NOTE: This is not specified in the SOG (the Arm Neoverse V1 Software Optimization Guide).
// The pattern is a bare prefix: every opcode whose name starts with PRFB,
// PRFH, PRFW or PRFD is given this write class.
1908 def : InstRW<[V1Write_4c_1L01], (instregex "^PRF[BHWD]")>;
1911 // SVE Store instructions
1912 // -----------------------------------------------------------------------------
1914 // Store from predicate reg (STR_PXI)
1915 def : InstRW<[V1Write_1c_1L01], (instrs STR_PXI)>;
1917 // Store from vector reg (STR_ZXI)
// Unlike the predicate store, this write class also names a V resource.
1918 def : InstRW<[V1Write_2c_1L01_1V], (instrs STR_ZXI)>;
1920 // Contiguous store, scalar + imm
1921 // Contiguous store, scalar + scalar
1922 def : InstRW<[V1Write_2c_1L01_1V], (instregex "^ST1[BHWD]_IMM$",
// ST1H, bare or with an _S / _D suffix; the write class adds an S resource.
1929 def : InstRW<[V1Write_2c_1L01_1S_1V], (instregex "^ST1H(_[SD])?$")>;
1931 // Contiguous store two structures from two vectors, scalar + imm
1932 // Contiguous store two structures from two vectors, scalar + scalar
1933 def : InstRW<[V1Write_4c_1L01_1V], (instregex "^ST2[BHWD]_IMM$",
// ST2H only (exact opcode via `instrs`); the write class adds an S resource.
1935 def : InstRW<[V1Write_4c_1L01_1S_1V], (instrs ST2H)>;
// ST3/ST4 structure stores. The scalar + scalar write classes carry the same
// numbers as the scalar + imm ones and add an S resource with the same count.
1937 // Contiguous store three structures from three vectors, scalar + imm
1938 def : InstRW<[V1Write_7c_5L01_5V], (instregex "^ST3[BHWD]_IMM$")>;
1940 // Contiguous store three structures from three vectors, scalar + scalar
1941 def : InstRW<[V1Write_7c_5L01_5S_5V], (instregex "^ST3[BHWD]$")>;
1943 // Contiguous store four structures from four vectors, scalar + imm
1944 def : InstRW<[V1Write_11c_9L01_9V], (instregex "^ST4[BHWD]_IMM$")>;
1946 // Contiguous store four structures from four vectors, scalar + scalar
1947 def : InstRW<[V1Write_11c_9L01_9S_9V], (instregex "^ST4[BHWD]$")>;
1949 // Non temporal store, scalar + imm
1950 // Non temporal store, scalar + scalar
// All _ZRI opcodes and the B/W/D _ZRR opcodes share one write class; the
// halfword _ZRR opcode is listed separately with an additional S resource.
1951 def : InstRW<[V1Write_2c_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$",
1952 "^STNT1[BWD]_ZRR$")>;
1953 def : InstRW<[V1Write_2c_1L01_1S_1V], (instrs STNT1H_ZRR)>;
1955 // Scatter store, vector + imm, 32-bit element size
1956 // Scatter store, 32-bit scaled offset
1957 // Scatter store, 32-bit unscaled offset
// Matches the _S (and SST1W) opcodes: _IMM, _[SU]XTW_SCALED and _[SU]XTW.
1958 def : InstRW<[V1Write_10c_2L01_2V], (instregex "^SST1[BH]_S_IMM$",
1960 "^SST1(H_S|W)_[SU]XTW_SCALED$",
1961 "^SST1[BH]_S_[SU]XTW$",
1962 "^SST1W_[SU]XTW$")>;
1964 // Scatter store, 32-bit unpacked unscaled offset
1965 // Scatter store, 32-bit unpacked scaled offset
// Matches the _D (and SST1D) opcodes that carry an _[SU]XTW suffix.
1966 def : InstRW<[V1Write_6c_1L01_1V], (instregex "^SST1[BHW]_D_[SU]XTW$",
1968 "^SST1[HW]_D_[SU]XTW_SCALED$",
1969 "^SST1D_[SU]XTW_SCALED$")>;
1971 // Scatter store vector + imm 64-bit element size
1972 // Scatter store, 64-bit scaled offset
1973 // Scatter store, 64-bit unscaled offset
1974 def : InstRW<[V1Write_6c_1L01_1V], (instregex "^SST1[BHW]_D_IMM$",
1976 "^SST1[HW]_D_SCALED$",
1982 // SVE Miscellaneous instructions
1983 // -----------------------------------------------------------------------------
1985 // Read first fault register, unpredicated
1986 // Set first fault register
1987 // Write to first fault register
1988 def : InstRW<[V1Write_2c_1M0], (instrs RDFFR_P,
1992 // Read first fault register, predicated (RDFFR_PPz)
1993 def : InstRW<[V1Write_3c_2M0], (instrs RDFFR_PPz)>;
1995 // Read first fault register and set flags (RDFFRS_PPz)
// Unlike the predicated read, this write class names the M resource rather
// than M0.
1996 def : InstRW<[V1Write_4c_1M], (instrs RDFFRS_PPz)>;