llvm/lib/Target/AArch64/AArch64SchedA510.td

   1 //==- AArch64SchedA510.td - ARM Cortex-A510 Scheduling Definitions -*- tablegen -*-=//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the machine model for the ARM Cortex-A510 processor.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 // ===---------------------------------------------------------------------===//
  14 // The following definitions describe the per-operand machine model.
  15 // This works with MachineScheduler. See MCSchedModel.h for details.
  16
  17 // Cortex-A510 machine model for scheduling and other instruction cost heuristics.
  18 def CortexA510Model : SchedMachineModel {
  19   let MicroOpBufferSize = 0;  // The Cortex-A510 is an in-order processor
  20   let IssueWidth = 3;         // Up to 3 micro-ops can be issued per cycle
  21   let LoadLatency = 3;        // Cycles for loads to access the cache.
  22                               // Most loads have a latency of 2, but some have higher latencies.
  23                               // 3 seems to be a good tradeoff
  24   let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
  25   let CompleteModel = 0;      // Covers instructions applicable to Cortex-A510.
  26
  27   // FIXME: Remove when all errors have been fixed.
  28   let FullInstRWOverlapCheck = 0;
  29 }
  30
  31
  32 //===----------------------------------------------------------------------===//
  33 // Subtarget-specific SchedWrite types
  34
  35 let SchedModel = CortexA510Model in {
  36
  37 //===----------------------------------------------------------------------===//
  38 // Define each kind of processor resource and number available.
  39
  40 // Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
  41 // Cortex-A510 is in-order.
  42 let BufferSize = 0 in {
  43   def CortexA510UnitALU0   : ProcResource<1>;    // Int ALU0
  44   def CortexA510UnitALU12  : ProcResource<2>;    // Int ALU1 & ALU2
  45   def CortexA510UnitMAC    : ProcResource<1>;    // Int MAC, 64-bit wide
  46   def CortexA510UnitDiv    : ProcResource<1>;    // Int Division, not pipelined
  47   // There are 2 LS pipes, 1 for Load/Store; 1 for Load only
  48   def CortexA510UnitLdSt   : ProcResource<1>;    // Load/Store shared pipe
  49   def CortexA510UnitLd1    : ProcResource<1>;    // Load pipe
  50   def CortexA510UnitB      : ProcResource<1>;    // Branch
  51   def CortexA510UnitPAC    : ProcResource<1>;    // Pointer Authentication (PAC) pipe
  52
  53   // The FP DIV/SQRT instructions execute totally differently from the FP ALU
  54   // instructions, which can mostly be dual-issued; that's why for now we model
  55   // them with 2 resources.
  56   def CortexA510UnitVALU0  : ProcResource<1>;    // SIMD/FP/SVE ALU0
  57   def CortexA510UnitVALU1  : ProcResource<1>;    // SIMD/FP/SVE ALU1
  58   def CortexA510UnitVMAC   : ProcResource<2>;    // SIMD/FP/SVE MAC
  59   def CortexA510UnitVMC    : ProcResource<1>;    // SIMD/FP/SVE multicycle instrs  (e.g. Div, SQRT, cryptography)
  60 }
  61
  62 def CortexA510UnitLd     : ProcResGroup<[CortexA510UnitLdSt, CortexA510UnitLd1]>;
  63 def CortexA510UnitVALU   : ProcResGroup<[CortexA510UnitVALU0, CortexA510UnitVALU1]>;
  64 def CortexA510UnitALU    : ProcResGroup<[CortexA510UnitALU0, CortexA510UnitALU12]>;
  65 // These latencies are modeled without taking into account forwarding paths
  66 // (the software optimisation guide lists latencies taking into account
  67 // typical forwarding paths).
  68 def : WriteRes<WriteImm, [CortexA510UnitALU]> { let Latency = 1; }    // MOVN, MOVZ
  69 def : WriteRes<WriteI, [CortexA510UnitALU]> { let Latency = 1; }      // ALU
  70 def : WriteRes<WriteISReg, [CortexA510UnitALU]> { let Latency = 2; }  // ALU of Shifted-Reg
  71 def : WriteRes<WriteIEReg, [CortexA510UnitALU]> { let Latency = 2; }  // ALU of Extended-Reg
  72 def : WriteRes<WriteExtr, [CortexA510UnitALU]> { let Latency = 2; }   // EXTR from a reg pair
  73 def : WriteRes<WriteIS, [CortexA510UnitALU]> { let Latency = 2; }     // Shift/Scale
  74
  75 // MAC
  76 def : WriteRes<WriteIM32, [CortexA510UnitMAC]> { let Latency = 3; }   // 32-bit Multiply
  77 def : WriteRes<WriteIM64, [CortexA510UnitMAC]> { let Latency = 5; let ReleaseAtCycles = [2];}   // 64-bit Multiply
  78
  79 // Div
  80 def : WriteRes<WriteID32, [CortexA510UnitDiv]> {
  81   let Latency = 8; let ReleaseAtCycles = [8];
  82 }
  83 def : WriteRes<WriteID64, [CortexA510UnitDiv]> {
  84   let Latency = 16; let ReleaseAtCycles = [16];
  85 }
  86
  87 //===----------------------------------------------------------------------===//
  88 // Define customized scheduler read/write types specific to the Cortex A510
  89
  90 //===----------------------------------------------------------------------===//
  91 class CortexA510Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
  92   let Latency = n;
  93 }
  94
  95 class CortexA510MCWrite<int n, int m, ProcResourceKind res> : SchedWriteRes<[res]> {
  96   let Latency = n;
  97   let ReleaseAtCycles = [m];
  98   let BeginGroup = 1;
  99 }
 100
 101 class CortexA510MC_RC0Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
 102   let Latency = n;
 103   let BeginGroup = 1;
 104 }
 105
 106 //===----------------------------------------------------------------------===//
 107 // Define generic 2 micro-op types
 108 def A510Write_10cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
 109   let Latency     = 10;
 110   let NumMicroOps = 2;
 111 }
 112
 113 def A510Write_15cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
 114   let Latency     = 15;
 115   let NumMicroOps = 2;
 116 }
 117
 118 class A510Write_PAC_B <int lat> : SchedWriteRes<[CortexA510UnitPAC, CortexA510UnitB]> {
 119   let Latency = lat;
 120   let NumMicroOps = 2;
 121 }
 122 // Load
 123 def : WriteRes<WriteLD, [CortexA510UnitLd]> { let Latency = 2; }
 124 def : WriteRes<WriteLDIdx, [CortexA510UnitLd]> { let Latency = 2; }
 125 def : WriteRes<WriteLDHi, [CortexA510UnitLd]> { let Latency = 2; }
 126
 127 def CortexA510WriteVLD1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
 128 def CortexA510WriteVLD1SI : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; let SingleIssue = 1; }
 129 def CortexA510WriteVLD2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 4;
 130                                                   let ReleaseAtCycles = [2]; }
 131 def CortexA510WriteVLD3 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5;
 132                                                   let ReleaseAtCycles = [3]; }
 133 def CortexA510WriteVLD4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6;
 134                                                   let ReleaseAtCycles = [4]; }
 135 def CortexA510WriteVLD6 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5;
 136                                                   let ReleaseAtCycles = [3]; }
 137 def CortexA510WriteVLD8 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6;
 138                                                   let ReleaseAtCycles = [4]; }
 139
 140 def CortexA510WriteLDP1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
 141 def CortexA510WriteLDP2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
 142 def CortexA510WriteLDP4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
 143
 144 // Pre/Post Indexing - Performed as part of address generation
 145 def : WriteRes<WriteAdr, []> { let Latency = 0; }
 146
 147 // Store
 148 let RetireOOO = 1 in {
 149 def : WriteRes<WriteST, [CortexA510UnitLdSt]> { let Latency = 1; }
 150 def : WriteRes<WriteSTP, [CortexA510UnitLdSt]> { let Latency = 1; }
 151 def : WriteRes<WriteSTIdx, [CortexA510UnitLdSt]> { let Latency = 1; }
 152 }
 153 def : WriteRes<WriteSTX, [CortexA510UnitLdSt]> { let Latency = 3; }
 154
 155 // Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
 156 def : WriteRes<WriteVST, [CortexA510UnitLdSt]> { let Latency = 5;
 157                                           let ReleaseAtCycles = [2];}
 158 def CortexA510WriteVST1 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 4; }
 159 def CortexA510WriteVST2 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
 160                                                   let ReleaseAtCycles = [2]; }
 161 def CortexA510WriteVST3 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
 162                                                   let ReleaseAtCycles = [3]; }
 163 def CortexA510WriteVST4 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
 164                                                   let ReleaseAtCycles = [4]; }
 165
 166 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
 167
 168 // Branch
 169 def : WriteRes<WriteBr, [CortexA510UnitB]>;
 170 def : WriteRes<WriteBrReg, [CortexA510UnitB]>;
 171 def : WriteRes<WriteSys, [CortexA510UnitB]>;
 172 def : WriteRes<WriteBarrier, [CortexA510UnitB]>;
 173 def : WriteRes<WriteHint, [CortexA510UnitB]>;
 174
 175 // FP ALU
 176 //   The WriteF result is produced in F5 and can mostly be forwarded
 177 //   to a consumer at F1, so the effective latency is set to 4.
 178 def : WriteRes<WriteF, [CortexA510UnitVALU]> { let Latency = 4; }
 179 def : WriteRes<WriteFCmp, [CortexA510UnitVALU]> { let Latency = 3; }
 180 def : WriteRes<WriteFCvt, [CortexA510UnitVALU]> { let Latency = 4; }
 181 def : WriteRes<WriteFCopy, [CortexA510UnitVALU]> { let Latency = 3; }
 182 def : WriteRes<WriteFImm, [CortexA510UnitVALU]> { let Latency = 3; }
 183
 184 class CortexA510VSt<int n> : SchedWriteRes<[CortexA510UnitLdSt]> {
 185   let RetireOOO = 1;
 186   let ReleaseAtCycles = [n];
 187 }
 188
 189 def CortexA510VSt0      : SchedWriteRes<[CortexA510UnitLdSt]> {
 190   let RetireOOO = 1;
 191 }
 192
 193 def : SchedAlias<WriteVd, CortexA510Write<4, CortexA510UnitVALU>>;
 194 def : SchedAlias<WriteVq, CortexA510Write<4, CortexA510UnitVALU>>;
 195
 196 // FP ALU specific new schedwrite definitions
 197 def CortexA510WriteFPALU_F3 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 3;}
 198 def CortexA510WriteFPALU_F4 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 4;}
 199
 200 // FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
 201 def : WriteRes<WriteFMul, [CortexA510UnitVMAC]> { let Latency = 4; }
 202
 203 let RetireOOO = 1 in {
 204 def : WriteRes<WriteFDiv, [CortexA510UnitVMC]> { let Latency = 22;
 205                                             let ReleaseAtCycles = [29]; }
 206 def CortexA510WriteVMAC : SchedWriteRes<[CortexA510UnitVMAC]> { let Latency = 4; }
 207 def CortexA510WriteFDivHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8;
 208                                                      let ReleaseAtCycles = [5]; }
 209 def CortexA510WriteFDivSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 13;
 210                                                      let ReleaseAtCycles = [10]; }
 211 def CortexA510WriteFDivDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22;
 212                                                      let ReleaseAtCycles = [19]; }
 213 def CortexA510WriteFSqrtHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8;
 214                                                       let ReleaseAtCycles = [5]; }
 215 def CortexA510WriteFSqrtSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 12;
 216                                                       let ReleaseAtCycles = [9]; }
 217 def CortexA510WriteFSqrtDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22;
 218                                                       let ReleaseAtCycles = [19]; }
 219 }
 220 //===----------------------------------------------------------------------===//
 221 // Subtarget-specific SchedRead types.
 222
 223 def : ReadAdvance<ReadVLD, 0>;
 224 def : ReadAdvance<ReadExtrHi, 0>;
 225 def : ReadAdvance<ReadAdrBase, 0>;
 226 def : ReadAdvance<ReadST, 1>;
 227
 228 def : ReadAdvance<ReadI, 0>;
 229 def : ReadAdvance<ReadISReg, 0>;
 230 def : ReadAdvance<ReadIEReg, 0>;
 231
 232
 233 // MUL
 234 def : ReadAdvance<ReadIM, 0>;
 235 def : ReadAdvance<ReadIMA, 2>;
 236
 237 // Div
 238 def : ReadAdvance<ReadID, 0>;
 239
 240 //===----------------------------------------------------------------------===//
 241 // Subtarget-specific InstRWs.
 242
 243 def A510WriteISReg : SchedWriteVariant<[
 244        SchedVar<RegShiftedPred, [WriteISReg]>,
 245        SchedVar<NoSchedPred, [WriteI]>]>;
 246 def : InstRW<[A510WriteISReg], (instregex ".*rs$")>;
 247 def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>;
 248
 249 // Pointer Authentication Instructions (v8.3 PAC)
 250 // -----------------------------------------------------------------------------
 251
 252 // Authenticate data address
 253 // Authenticate instruction address
 254 // Compute pointer authentication code for data address
 255 // Compute pointer authentication code, using generic key
 256 // Compute pointer authentication code for instruction address
 257 def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>;
 258
 259 // Branch and link, register, with pointer authentication
 260 // Branch, register, with pointer authentication
 261 // Branch, return, with pointer authentication
 262 def : InstRW<[A510Write_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
 263                                             BRAAZ, BRAB, BRABZ, RETAA, RETAB,
 264                                             ERETAA, ERETAB)>;
 265
 266 // Load register, with pointer authentication
 267 def : InstRW<[CortexA510Write<2, CortexA510UnitPAC>], (instregex "^LDRA[AB](indexed|writeback)")>;
 268
 269 // Strip pointer authentication code
 270 def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instrs XPACD, XPACI, XPACLRI)>;
 271 //---
 272 // Miscellaneous
 273 //---
 274 def : InstRW<[CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?Wi")>;
 275 def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPSi")>;
 276 def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)i")>;
 277 def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQi")>;
 278 def : InstRW<[WriteAdr, CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?W(pre|post)")>;
 279 def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPS(pre|post)")>;
 280 def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
 281 def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQ(pre|post)")>;
 282 def : InstRW<[WriteI], (instrs COPY)>;
 283 //---
 284 // Vector Loads - 128-bit per cycle
 285 //---
 286 //   1-element structures
 287 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;                // single element
 288 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
 289 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
 290 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
 291 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures
 292 def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
 293 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
 294 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 295 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 296 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 297
 298 def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
 299 def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 300 def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
 301 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
 302 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
 303 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
 304 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
 305 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
 306 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
 307 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
 308
 309 //    2-element structures
 310 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
 311 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 312 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
 313 def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 314
 315 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)_POST$")>; // post-index forms only; non-post forms are listed above
 316 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 317 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
 318 def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
 319
 320 //    3-element structures
 321 def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
 322 def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 323 def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
 324 def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 325
 326 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
 327 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 328 def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
 329 def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
 330
 331 //    4-element structures
 332 def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
 333 def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs.
 334 def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
 335 def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 336
 337 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
 338 def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 339 def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
 340 def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
 341
 342 //---
 343 // Vector Stores
 344 //---
 345 def : InstRW<[CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
 346 def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 347 def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 348 def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 349 def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 350 def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
 351 def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 352 def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 353 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 354 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 355
 356 def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
 357 def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
 358 def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
 359 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
 360 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
 361 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 362
 363 def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
 364 def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 365 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
 366 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // same arrangement list as the non-post form
 367
 368 def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
 369 def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 370 def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
 371 def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 372
 373 //---
 374 // Floating Point Conversions, MAC, DIV, SQRT
 375 //---
 376 def : InstRW<[CortexA510WriteFPALU_F3], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
 377 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^XTN")>;
 378 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
 379 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
 380
 381 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
 382 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
 383 def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
 384
 385 def : InstRW<[CortexA510WriteVMAC], (instregex "^FN?M(ADD|SUB).*")>;
 386 def : InstRW<[CortexA510WriteVMAC], (instregex "^FML(A|S)v.*")>;
 387 def : InstRW<[CortexA510WriteFDivHP], (instrs FDIVHrr)>;
 388 def : InstRW<[CortexA510WriteFDivSP], (instrs FDIVSrr)>;
 389 def : InstRW<[CortexA510WriteFDivDP], (instrs FDIVDrr)>;
 390 def : InstRW<[CortexA510WriteFDivHP], (instregex "^FDIVv.*16$")>;
 391 def : InstRW<[CortexA510WriteFDivSP], (instregex "^FDIVv.*32$")>;
 392 def : InstRW<[CortexA510WriteFDivDP], (instregex "^FDIVv.*64$")>;
 393 def : InstRW<[CortexA510WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
 394 def : InstRW<[CortexA510WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
 395 def : InstRW<[CortexA510WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
 396
 397 def : InstRW<[CortexA510WriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr)>;
 398
 399 // 4.15. Advanced SIMD integer instructions
 400 // ASIMD absolute diff
 401 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
 402 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
 403 // ASIMD absolute diff accum
 404 def : InstRW<[CortexA510Write<6, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>;
 405 // ASIMD absolute diff long
 406 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>;
 407 // ASIMD arith #1
 408 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v",
 409   "[SU]R?HADDv", "[SU]HSUBv")>;
 410 // ASIMD arith #2
 411 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
 412   "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
 413   "ADDPv(2i32|4i16|8i8)$")>;
 414 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>;
 415 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
 416   "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
 417   "ADDPv(16i8|2i64|4i32|8i16)$")>;
 418 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>;
 419 // ASIMD arith #3
 420 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex  "SADDLv", "UADDLv", "SADDWv",
 421   "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>;
 422 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex  "ADDHNv", "SUBHNv")>;
 423 // ASIMD arith #5
 424 def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>;
 425 // ASIMD arith, reduce
 426 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex  "ADDVv")>;
 427 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex  "SADDLVv", "UADDLVv")>;
 428 // ASIMD compare #1
 429 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
 430 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
 431 // ASIMD compare #2
 432 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
 433 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
 434 // ASIMD logical #1
 435 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8",
 436   "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
 437 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8",
 438   "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
 439 // ASIMD max/min, basic
 440 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
 441 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>; // 128-bit forms
 442 // ASIMD max/min, reduce
 443 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>;
 444 // ASIMD multiply, by element
 445 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
 446   "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
 447 // ASIMD multiply
 448 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv8i8)>;
 449 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv16i8)>;
 450 // ASIMD multiply accumulate
 451 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
 452 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
 453 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
 454 // ASIMD multiply accumulate half
 455 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQRDML[AS]H[vi]")>;
 456 // ASIMD multiply accumulate long
 457 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]ML[AS]Lv")>;
 458 // ASIMD multiply accumulate long #2
 459 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQDML[AS]L[iv]")>;
 460 // ASIMD dot product
 461 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv8i8")>;
 462 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv16i8")>;
 463 // ASIMD dot product, by scalar
 464 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTlanev")>;
 465 // ASIMD multiply long
 466 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
 467 // ASIMD polynomial (8x8) multiply long
 468 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>;
 469 // ASIMD pairwise add and accumulate
 470 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>;
 471 // ASIMD shift accumulate
 472 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
 473 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
 474 // ASIMD shift accumulate #2
 475 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>;
 476 // ASIMD shift by immed
 477 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv",
 478   "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
 479 // ASIMD shift by immed
 480 // SXTL and UXTL are aliases for SHLL
 481 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[US]?SHLLv")>;
 482 // ASIMD shift by immed #2
 483 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
 484   "[SU]RSHRv(16i8|2i64|4i32|8i16)")>;
 485 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RSHRNv(2i32|4i16|8i8)",
 486   "RSHRNv(16i8|4i32|8i16)")>;
 487 // ASIMD shift by register
 488 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
 489 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
 490 // ASIMD shift by register #2
 491 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
 492 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;
 493
 494 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(1i64|2i32|4i16|8i8)")>;
 495 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(2i64|4i32|8i16|16i8)")>;
 496
 497 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(1i64|2i32|4i16|8i8)")>;
 498 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i64|4i32|8i16|16i8)")>;
 499
 500 // Cryptography extensions
 501 // -----------------------------------------------------------------------------
 502
 503 // Crypto AES ops
 504 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
 505
 506 // Crypto polynomial (64x64) multiply long
 507 def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>;
 508
 509 // Crypto SHA1 hash acceleration op
 510 // Crypto SHA1 schedule acceleration ops
 511 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|SU1)")>;
 512
 513 // Crypto SHA1 hash acceleration ops
 514 // Crypto SHA256 hash acceleration ops
 515 def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
 516
 517 // Crypto SHA256 schedule acceleration ops
 518 def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>;
 519
 520 // Crypto SHA512 hash acceleration ops
 521 def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>;
 522
 523 // Crypto SHA3 ops
 524 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3)>;
 525 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs XAR)>;
 526 def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs RAX1)>;
 527
 528
 529 // Crypto SM3 ops
 530 def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
 531                                                             "^SM3TT[12][AB]$")>;
 532
 533 // Crypto SM4 ops
 534 def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>;
 535
 536 // CRC
 537 // -----------------------------------------------------------------------------
 538
 539 def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")>;
 540
 541 // SVE Predicate instructions
 542
 543 // Loop control, based on predicate
 544 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP,
 545                                                   BRKB_PPmP, BRKB_PPzP)>;
 546
 547 // Loop control, based on predicate and flag setting
 548 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
 549
 550 // Loop control, propagating
 551 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>;
 552
 553 // Loop control, propagating and flag setting
 554 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>;
 555 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>;
 556
 557
 558 // Loop control, based on GPR
 559 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>],
 560              (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
 561
 562 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
 563
 564 // Loop terminate
 565 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
 566
 567 // Predicate counting scalar
 568 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
 569
 570 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
 571              (instregex "^CNT[BHWD]_XPiI")>;
 572
 573 def : InstRW<[CortexA510Write<3, CortexA510UnitALU>],
 574              (instregex "^(INC|DEC)[BHWD]_XPiI")>;
 575
 576 def : InstRW<[CortexA510Write<4, CortexA510UnitALU>],
 577              (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>;
 578
 579 // Predicate counting scalar, active predicate
 580 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
 581              (instregex "^CNTP_XPP_[BHSD]")>;
 582
 583 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
 584              (instregex "^(DEC|INC)P_XP_[BHSD]")>;
 585
 586 def : InstRW<[CortexA510Write<9, CortexA510UnitVALU0>],
 587              (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
 588                         "^(UQDEC|UQINC)P_WP_[BHSD]",
 589                         "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>;
 590
 591
 592 // Predicate counting vector, active predicate
 593 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 594              (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
 595
 596 // Predicate logical
 597 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>],
 598              (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
 599
 600 // Predicate logical, flag setting
 601 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>],
 602              (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
 603
 604 // Predicate reverse
 605 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>;
 606
 607 // Predicate select
 608 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs SEL_PPPP)>;
 609
 610 // Predicate set
 611 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
 612
 613 // Predicate set/initialize, set flags
 614 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>;
 615
 616 // Predicate find first/next
 617 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
 618
 619 // Predicate test
 620 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PTEST_PP)>;
 621
 622 // Predicate transpose
 623 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>;
 624
 625 // Predicate unpack and widen
 626 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
 627
 628 // Predicate zip/unzip
 629 def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>;
 630
 631
 632 // SVE integer instructions
 633 // -----------------------------------------------------------------------------
 634 // Arithmetic, absolute diff
 635 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>;
 636
 637 // Arithmetic, absolute diff accum
 638 def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
 639
 640 // Arithmetic, absolute diff accum long
 641 def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
 642
 643 // Arithmetic, absolute diff long
 644 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
 645
 646 // Arithmetic, basic
 647 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
 648              (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]",
 649                         "^(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
 650                         "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
 651                         "^(ADD|SUB)_ZZZ_[BHSD]",
 652                         "^(ADD|SUB|SUBR)_ZI_[BHSD]",
 653                         "^ADR_[SU]XTW_ZZZ_D_[0123]",
 654                         "^ADR_LSL_ZZZ_[SD]_[0123]",
 655                         "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>;
 656 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 657              (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
 658                         "^SADDLBT_ZZZ_[HSD]",
 659                         "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
 660
 661 // Arithmetic, complex
 662 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 663              (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]",
 664                         "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]",
 665                         "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
 666                         "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
 667                         "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
 668                         "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
 669 def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>],
 670              (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]")>;
 671
 672 // Arithmetic, large integer
 673 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
 674
 675 // Arithmetic, pairwise add
 676 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^ADDP_ZPmZ_[BHSD]")>;
 677
 678 // Arithmetic, pairwise add and accum long
 679 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
 680
 681 // Arithmetic, shift
 682 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
 683              (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
 684                         "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
 685                         "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
 686                         "^(ASR|LSL|LSR)_ZPZI_[BHSD]",
 687                         "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
 688                         "^(ASR|LSL|LSR)_ZPZZ_[BHSD]",
 689                         "^(ASR|LSL|LSR)_ZZI_[BHSD]",
 690                         "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
 691 // Arithmetic, shift right for divide
 692 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 693              (instregex "^ASRD_ZPmI_[BHSD]",
 694                         "^ASRD_ZPZI_[BHSD]")>;
 695
 696 // Arithmetic, shift and accumulate
 697 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 698              (instregex "^(SSRA|USRA)_ZZI_[BHSD]")>;
 699
 700 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>],
 701              (instregex "^(SRSRA|URSRA)_ZZI_[BHSD]")>;
 702
 703
 704 // Arithmetic, shift by immediate
 705 // Arithmetic, shift by immediate and insert
 706 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
 707              (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]")>;
 708
 709 // Arithmetic, shift complex
 710 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 711              (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
 712                         "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_(ZPmZ|ZPZZ)_[BHSD]",
 713                         "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
 714                         "^SQSHRU?N[BT]_ZZI_[BHS]",
 715                         "^UQR?SHRN[BT]_ZZI_[BHS]")>;
 716
 717 // Arithmetic, shift rounding
 718 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 719              (instregex "^(SRSHL|SRSHR|SRSHLR|URSHL|URSHLR|URSHR)_(ZPmZ|ZPZZ|ZPZI)_[BHSD]",
 720                         "^[SU]RSHR_ZPmI_[BHSD]")>;
 721
 722 // Bit manipulation
 723 def : InstRW<[CortexA510MCWrite<14, 13, CortexA510UnitVMC>],
 724              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_B")>;
 725
 726 def : InstRW<[CortexA510MCWrite<22, 21, CortexA510UnitVMC>],
 727              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_H")>;
 728
 729 def : InstRW<[CortexA510MCWrite<38, 37, CortexA510UnitVMC>],
 730              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_S")>;
 731
 732 def : InstRW<[CortexA510MCWrite<70, 69, CortexA510UnitVMC>],
 733              (instregex "^(BDEP|BEXT|BGRP)_ZZZ_D")>;
 734
 735
 736 // Bitwise select
 737 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
 738
 739 // Count/reverse bits
 740 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>;
 741 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>;
 742 def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>;
 743 def : InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>;
 744 // Broadcast logical bitmask immediate to vector
 745 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>;
 746
 747 // Compare and set flags
 748 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 749              (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
 750                         "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
 751
 752 // Complex add
 753 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CADD_ZZI_[BHSD]")>;
 754
 755 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^SQCADD_ZZI_[BHSD]")>;
 756
 757 // Complex dot product 8-bit element
 758 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
 759
 760 // Complex dot product 16-bit element
 761 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
 762
 763 // Complex multiply-add B, H, S element size
 764 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^CMLA_ZZZ_[BHS]",
 765                                             "^CMLA_ZZZI_[HS]")>;
 766
 767 // Complex multiply-add D element size
 768 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CMLA_ZZZ_D)>;
 769
 770 // Conditional extract operations, scalar form
 771 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
 772
 773 // Conditional extract operations, SIMD&FP scalar and vector forms
 774 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
 775                                             "^COMPACT_ZPZ_[SD]",
 776                                             "^SPLICE_ZPZZ?_[BHSD]")>;
 777
 778 // Convert to floating point, 64b to float or convert to double
 779 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]")>;
 780
 781 // Convert to floating point, 64b to half
 782 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_DtoH")>;
 783
 784 // Convert to floating point, 32b to single or half
 785 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
 786
 787 // Convert to floating point, 32b to double
 788 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_StoD")>;
 789
 790 // Convert to floating point, 16b to half
 791 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
 792
 793 // Copy, scalar
 794 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>],(instregex "^CPY_ZPmR_[BHSD]")>;
 795
 796 // Copy, scalar SIMD&FP or imm
 797 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CPY_ZPm[IV]_[BHSD]",
 798                                            "^CPY_ZPzI_[BHSD]")>;
 799
 800 // Divides, 32 bit
 801 def : InstRW<[CortexA510MCWrite<15, 12, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_S")>;
 802
 803 // Divides, 64 bit
 804 def : InstRW<[CortexA510MCWrite<26, 23, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>;
 805
 806 // Dot product, 8 bit
 807 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>;
 808
 809 // Dot product, 8 bit, using signed and unsigned integers
 810 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
 811
 812 // Dot product, 16 bit
 813 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>;
 814
 815 // Duplicate, immediate and indexed form
 816 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZI_[BHSD]",
 817                                            "^DUP_ZZI_[BHSDQ]")>;
 818
 819 // Duplicate, scalar form
 820 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZR_[BHSD]")>;
 821
 822 // Extend, sign or zero
 823 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ_[HSD]",
 824                                             "^[SU]XTH_ZPmZ_[SD]",
 825                                             "^[SU]XTW_ZPmZ_[D]")>;
 826
 827 // Extract
 828 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>;
 829
 830 // Extract narrow saturating
 831 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
 832                                             "^SQXTUN[BT]_ZZ_[BHS]")>;
 833
 834 // Extract/insert operation, SIMD and FP scalar form
 835 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^LAST[AB]_VPZ_[BHSD]",
 836                                             "^INSR_ZV_[BHSD]")>;
 837
 838 // Extract/insert operation, scalar
 839 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^LAST[AB]_RPZ_[BHSD]",
 840                                                 "^INSR_ZR_[BHSD]")>;
 841
 842 // Histogram operations
 843 def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^HISTCNT_ZPzZZ_[SD]",
 844                                                   "^HISTSEG_ZZZ")>;
 845
 846 // Horizontal operations, B, H, S form, immediate operands only
 847 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_II_[BHS]")>;
 848
 849 // Horizontal operations, B, H, S form, scalar, immediate operands/ scalar
 850 // operands only / immediate, scalar operands
 851 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
 852
 853 // Horizontal operations, D form, immediate operands only
 854 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs INDEX_II_D)>;
 855
 856 // Horizontal operations, D form, scalar, immediate operands/ scalar operands
 857 // only / immediate, scalar operands
 858 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_D")>;
 859
 860 // Logical: _ZI, _ZZZ, _ZPmZ and _ZPZZ forms
 861 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
 862              (instregex "^(AND|EOR|ORR)_ZI",
 863                         "^(AND|BIC|EOR|ORR)_ZZZ",
 864                         "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]",
 865                         "^(AND|BIC|EOR|NOT|ORR)_ZPZZ_[BHSD]")>;
 866
 867 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 868              (instregex "^EOR(BT|TB)_ZZZ_[BHSD]")>;
 869
 870 // Max/min, basic and pairwise
 871 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
 872                                            "^[SU](MAX|MIN)P?_(ZPmZ|ZPZZ)_[BHSD]")>;
 873
 874 // Matching operations
 875 def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^N?MATCH_PPzZZ_[BH]")>;
 876
 877 // Matrix multiply-accumulate
 878 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
 879
 880 // Move prefix
 881 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
 882                                            "^MOVPRFX_ZZ")>;
 883
 884 // Multiply, B, H, S element size
 885 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_[BHS]",
 886                                             "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_[BHS]")>;
 887
 888 // Multiply, D element size
 889 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_D",
 890                                             "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_D")>;
 891
 892 // Multiply long
 893 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
 894                                             "^[SU]MULL[BT]_ZZZ_[HSD]")>;
 895
 896 // Multiply accumulate, B, H, S element size
 897 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_[BHS]",
 898                                             "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
 899
 900 // Multiply accumulate, D element size
 901 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_D",
 902                                             "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
 903
 904 // Multiply accumulate long
 905 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
 906                                             "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
 907
 908 // Multiply accumulate saturating doubling long regular
 909 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]",
 910                                             "^SQDML[AS](LB|LT)_ZZZI_[SD]")>;
 911
 912 // Multiply saturating doubling high, B, H, S element size
 913 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULH_ZZZ_[BHS]",
 914                                             "^SQDMULH_ZZZI_[HS]")>;
 915
 916 // Multiply saturating doubling high, D element size
 917 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
 918
 919 // Multiply saturating doubling long
 920 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
 921                                             "^SQDMULL[BT]_ZZZI_[SD]")>;
 922
 923 // Multiply saturating rounding doubling regular/complex accumulate, B, H, S
 924 // element size
 925 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
 926                                             "^SQRDCMLAH_ZZZ_[BHS]",
 927                                             "^SQRDML[AS]H_ZZZI_[HS]",
 928                                             "^SQRDCMLAH_ZZZI_[HS]")>;
 929
 930 // Multiply saturating rounding doubling regular/complex accumulate, D element
 931 // size
 932 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZI?_D",
 933                                             "^SQRDCMLAH_ZZZ_D")>;
 934
 935 // Multiply saturating rounding doubling regular/complex, B, H, S element size
 936 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZ_[BHS]",
 937                                             "^SQRDMULH_ZZZI_[HS]")>;
 938
 939 // Multiply saturating rounding doubling regular/complex, D element size
 940 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZI?_D")>;
 941
 942 // Multiply/multiply long, (8x8) polynomial
 943 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>;
 944
 945 def : InstRW<[CortexA510Write<9, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>;
 946
 947
 948 // Predicate counting vector
 949 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
 950              (instregex "^(DEC|INC)[HWD]_ZPiI")>;
 951 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
 952              (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>;
 953
 954 // Reciprocal estimate
 955 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
 956
 957 // Reduction, arithmetic, B form
 958 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
 959
 960 // Reduction, arithmetic, H form
 961 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
 962
 963 // Reduction, arithmetic, S form
 964 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
 965
 966 // Reduction, arithmetic, D form
 967 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
 968
 969 // Reduction, logical
 970 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>;
 971
 972 // Reverse, vector
 973 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]",
 974                                            "^REVB_ZPmZ_[HSD]",
 975                                            "^REVH_ZPmZ_[SD]",
 976                                            "^REVW_ZPmZ_D")>;
 977
 978 // Select, vector form
 979 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SEL_ZPZZ_[BHSD]")>;
 980
 981 // Table lookup
 982 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[BHSD]")>;
 983
 984 // Table lookup extension
 985 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>;
 986
 987 // Transpose, vector form
 988 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
 989
 990 // Unpack and extend
 991 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
 992
 993 // Zip/unzip
 994 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
 995
 996 // SVE floating-point instructions
 997 // -----------------------------------------------------------------------------
 998
 999 // Floating point absolute value/difference
1000 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FAB[SD]_ZPmZ_[HSD]",
1001                                                                   "^FAB[SD]_ZPZZ_[HSD]")>;
1002
1003 // Floating point arithmetic
1004 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ|ZPZI|ZPZZ)_[HSD]",
1005                                            "^FADDP_ZPmZZ_[HSD]",
1006                                            "^FNEG_ZPmZ_[HSD]",
1007                                            "^FSUBR_(ZPm[IZ]|ZPZ[IZ])_[HSD]")>;
1008
1009 // Floating point associative add, F16
1010 def : InstRW<[CortexA510MCWrite<32, 29, CortexA510UnitVALU>], (instrs FADDA_VPZ_H)>;
1011
1012 // Floating point associative add, F32
1013 def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_S)>;
1014
1015 // Floating point associative add, F64
1016 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;
1017
1018 // Floating point compare
1019 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
1020                                             "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
1021                                             "^FCM(LE|LT)_PPzZ0_[HSD]",
1022                                             "^FCMUO_PPzZZ_[HSD]")>;
1023
1024 // Floating point complex add
1025 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCADD_ZPmZ_[HSD]")>;
1026
1027 // Floating point complex multiply add
1028 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FCMLA_ZPmZZ_[HSD]",
1029                                            "^FCMLA_ZZZI_[HS]")>;
1030
1031 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
1032 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
1033                                             "^FCVTLT_ZPmZ_HtoS",
1034                                             "^FCVTNT_ZPmZ_StoH")>;
1035
1036 // Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
1037 // or F64 to F16)
1038 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
1039                                             "^FCVTLT_ZPmZ_StoD",
1040                                             "^FCVTNT_ZPmZ_DtoS")>;
1041
1042 // Floating point convert, round to odd (FCVTX and FCVTXNT, F64 to F32)
1043 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTX_ZPmZ_DtoS", "^FCVTXNT_ZPmZ_DtoS")>;
1044
1045 // Floating point base2 log, F16
1046 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
1047
1048 // Floating point base2 log, F32
1049 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
1050
1051 // Floating point base2 log, F64
1052 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
1053
1054 // Floating point convert to integer, F16
1055 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
1056
1057 // Floating point convert to integer, F32
1058 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
1059
1060 // Floating point convert to integer, F64
1061 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
1062              (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
1063
1064 // Floating point copy
1065 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>], (instregex "^FCPY_ZPmI_[HSD]",
1066                                            "^FDUP_ZI_[HSD]")>;
1067
1068 // Floating point divide, F16
1069 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
1070
1071 // Floating point divide, F32
1072 def : InstRW<[CortexA510MCWrite<13, 10, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
1073
1074 // Floating point divide, F64
1075 def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
1076
1077 // Floating point min/max pairwise
1078 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
1079
1080 // Floating point min/max
1081 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?_(ZPm[IZ]|ZPZZ|ZPZI)_[HSD]")>;
1082
1083 // Floating point multiply
1084 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^(FSCALE|FMULX)_(ZPmZ|ZPZZ)_[HSD]",
1085                                            "^FMUL_(ZPm[IZ]|ZZZI?|ZPZI|ZPZZ)_[HSD]")>;
1086
1087 // Floating point multiply accumulate
1088 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
1089              (instregex "^FML[AS]_(ZPmZZ|ZZZI|ZPZZZ)_[HSD]",
1090                         "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_(ZPmZZ|ZPZZZ)_[HSD]")>;
1091
1092 // Floating point multiply add/sub accumulate long
1093 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
1094
1095 // Floating point reciprocal estimate, F16
1096 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_H", "^FRECPX_ZPmZ_H",
1097                                          "^FRSQRTE_ZZ_H")>;
1098
1099 // Floating point reciprocal estimate, F32
1100 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_S", "^FRECPX_ZPmZ_S",
1101                                          "^FRSQRTE_ZZ_S")>;
1102 // Floating point reciprocal estimate, F64
1103 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],(instregex "^FRECPE_ZZ_D", "^FRECPX_ZPmZ_D",
1104                                          "^FRSQRTE_ZZ_D")>;
1105
1106 // Floating point reciprocal step
1107 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
1108
1109 // Floating point reduction, max/min, F16, F32 and F64
1110 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
1111              (instregex "^(FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_[HSD]")>;
1112
1113 // Floating point reduction, add, F16 (the F32 and F64 forms follow)
1114 def : InstRW<[CortexA510MCWrite<12, 11, CortexA510UnitVALU0>],
1115              (instregex "^FADDV_VPZ_H")>;
1116
1117 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU0>],
1118              (instregex "^FADDV_VPZ_S")>;
1119
1120 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
1121              (instregex "^FADDV_VPZ_D")>;
1122
1123
1124 // Floating point round to integral, F16
1125 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
1126
1127 // Floating point round to integral, F32
1128 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
1129
1130 // Floating point round to integral, F64
1131 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
1132
1133 // Floating point square root, F16
1134 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_H")>;
1135
1136 // Floating point square root, F32
1137 def : InstRW<[CortexA510MCWrite<12, 9, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_S")>;
1138
1139 // Floating point square root, F64
1140 def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_D")>;
1141
1142 // Floating point trigonometric exponentiation
1143 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FEXPA_ZZ_[HSD]")>;
1144
1145 // Floating point trigonometric multiply add
1146 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[HSD]")>;
1147
1148 // Floating point trigonometric, miscellaneous
1149 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>;
1150 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>;
1151
1152
1153 // SVE BFloat16 (BF16) instructions
1154 // -----------------------------------------------------------------------------
1155
1156 // Convert, F32 to BF16
1157 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
1158
1159 // Dot product
1160 def : InstRW<[A510Write_10cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
1161
1162 // Matrix multiply accumulate
1163 def : InstRW<[A510Write_15cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>;
1164
1165 // Multiply accumulate long
1166 def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>;
1167
1168 // SVE Load instructions
1169 // -----------------------------------------------------------------------------
1170
1171 // Load vector
1172 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instrs LDR_ZXI)>;
1173
1174 // Load predicate
1175 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instrs LDR_PXI)>;
1176
1177 // Contiguous load, scalar + imm
1178 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]_IMM$",
1179                                            "^LD1S?B_[HSD]_IMM$",
1180                                            "^LD1S?H_[SD]_IMM$",
1181                                            "^LD1S?W_D_IMM$" )>;
1182 // Contiguous load, scalar + scalar
1183 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]$",
1184                                              "^LD1S?B_[HSD]$",
1185                                              "^LD1S?H_[SD]$",
1186                                              "^LD1S?W_D$" )>;
1187
1188 // Contiguous load broadcast, scalar + imm
1189 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1R[BHWD]_IMM$",
1190                                            "^LD1RSW_IMM$",
1191                                            "^LD1RS?B_[HSD]_IMM$",
1192                                            "^LD1RS?H_[SD]_IMM$",
1193                                            "^LD1RS?W_D_IMM$",
1194                                            "^LD1RQ_[BHWD]_IMM$")>;
1195
1196 // Contiguous load broadcast, scalar + scalar (only the LD1RQ forms are matched here)
1197 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LD1RQ_[BHWD]$")>;
1198
1199 // Non temporal load, scalar + imm
1200 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRI$")>;
1201
1202 // Non temporal load, scalar + scalar
1203 def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRR$")>;
1204
1205 // Non temporal gather load, vector + scalar 32-bit element size
1206 def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>], (instregex "^LDNT1[BHW]_ZZR_S$",
1207                                               "^LDNT1S[BH]_ZZR_S$")>;
1208
1209 // Non temporal gather load, vector + scalar 64-bit element size
1210 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
1211 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instrs LDNT1D_ZZR_D)>;
1212
1213 // Contiguous first faulting load, scalar + scalar
1214 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LDFF1[BHWD]$",
1215                                               "^LDFF1S?B_[HSD]$",
1216                                               "^LDFF1S?H_[SD]$",
1217                                               "^LDFF1S?W_D$")>;
1218
1219 // Contiguous non faulting load, scalar + imm
1220 def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LDNF1[BHWD]_IMM$",
1221                                            "^LDNF1S?B_[HSD]_IMM$",
1222                                            "^LDNF1S?H_[SD]_IMM$",
1223                                            "^LDNF1S?W_D_IMM$")>;
1224
1225 // Contiguous Load two structures to two vectors, scalar + imm
1226 def : InstRW<[CortexA510MCWrite<3, 1, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]_IMM$")>;
1227
1228 // Contiguous Load two structures to two vectors, scalar + scalar
1229 def : InstRW<[CortexA510MCWrite<3, 2, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]$")>;
1230
1231 // Contiguous Load three structures to three vectors, scalar + imm
1232 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]_IMM$")>;
1233
1234 // Contiguous Load three structures to three vectors, scalar + scalar
1235 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]$")>;
1236
1237 // Contiguous Load four structures to four vectors, scalar + imm
1238 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]_IMM$")>;
1239
1240 // Contiguous Load four structures to four vectors, scalar + scalar
1241 def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]$")>;
1242
1243 // Gather load, vector + imm, 32-bit element size
1244 def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
1245                                               "^GLD(FF)?1W_IMM$")>;
1246
1247 // Gather load, vector + imm, 64-bit element size
1248 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
1249                                               "^GLD(FF)?1D_IMM$")>;
1250
1251 // Gather load, 64-bit element size: SXTW/UXTW-extended and plain offset forms, scaled or unscaled
1252 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>],
1253              (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW(_SCALED)?$",
1254                         "^GLD(FF)?1S?[BHW]_D(_SCALED)?$",
1255                         "^GLD(FF)?1D_[SU]XTW(_SCALED)?$",
1256                         "^GLD(FF)?1D(_SCALED)?$")>;
1257
1258 // Gather load, 32-bit scaled offset
1259 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>],
1260              (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$",
1261                         "^GLD(FF)?1W_[SU]XTW_SCALED")>;
1262
1263 // Gather load, 32-bit unscaled offset (32-bit element _S_/W forms; the unpacked _D_ forms are matched by the 64-bit element size entry above)
1264 def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
1265                                               "^GLD(FF)?1W_[SU]XTW$")>;
1266 // SVE prefetch: every opcode whose name starts with PRFB/PRFH/PRFW/PRFD (the trailing ".*" accepts any suffix)
1267 def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>], (instregex "^PRF(B|H|W|D).*")>;
1268 // SVE Store instructions
1269 // -----------------------------------------------------------------------------
1270
1271 // Store from predicate reg
1272 def : InstRW<[CortexA510VSt0], (instrs STR_PXI)>;
1273
1274 // Store from vector reg
1275 def : InstRW<[CortexA510VSt0], (instrs STR_ZXI)>;
1276
1277 // Contiguous store, scalar + imm
1278 def : InstRW<[CortexA510VSt0], (instregex "^ST1[BHWD]_IMM$",
1279                                                 "^ST1B_[HSD]_IMM$",
1280                                                 "^ST1H_[SD]_IMM$",
1281                                                 "^ST1W_D_IMM$")>;
1282
1283 // Contiguous store, scalar + scalar (ST1H forms are listed separately but use the same write)
1284 def : InstRW<[CortexA510VSt0], (instregex "^ST1H(_[SD])?$")>;
1285 def : InstRW<[CortexA510VSt0], (instregex "^ST1[BWD]$",
1286                                                 "^ST1B_[HSD]$",
1287                                                 "^ST1W_D$")>;
1288
1289 // Contiguous store two structures from two vectors, scalar + imm
1290 def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BHWD]_IMM$")>;
1291
1292 // Contiguous store two structures from two vectors, scalar + scalar (ST2H)
1293 def : InstRW<[CortexA510VSt<11>], (instrs ST2H)>;
1294
1295 // Contiguous store two structures from two vectors, scalar + scalar (ST2B, ST2W, ST2D)
1296 def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BWD]$")>;
1297
1298 // Contiguous store three structures from three vectors, scalar + imm
1299 def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]_IMM$")>;
1300 def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D_IMM$")>;
1301
1302 // Contiguous store three structures from three vectors, scalar + scalar
1303 def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]$")>;
1304 def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D$")>;
1305
1306 // Contiguous store four structures from four vectors, scalar + imm
1307 def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]_IMM$")>;
1308 def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D_IMM$")>;
1309
1310 // Contiguous store four structures from four vectors, scalar + scalar (ST4B, ST4H, ST4W)
1311 def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]$")>;
1312
1313 // Contiguous store four structures from four vectors, scalar + scalar (ST4D)
1314 def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D$")>;
1315
1316 // Non temporal store, scalar + imm
1317 def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BHWD]_ZRI$")>;
1318
1319 // Non temporal store, scalar + scalar
1320 def : InstRW<[CortexA510VSt0], (instrs STNT1H_ZRR)>;
1321 def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BWD]_ZRR$")>;
1322
1323 // Scatter non temporal store, vector + scalar 32-bit element size
1324 def : InstRW<[CortexA510VSt<9>], (instregex "^STNT1[BHW]_ZZR_S")>;
1325
1326 // Scatter non temporal store, vector + scalar 64-bit element size
1327 def : InstRW<[CortexA510VSt<7>], (instregex "^STNT1[BHWD]_ZZR_D")>;
1328
1329 // Scatter store vector + imm 32-bit element size
1330 def : InstRW<[CortexA510VSt<9>], (instregex "^SST1[BH]_S_IMM$",
1331                                                 "^SST1W_IMM$")>;
1332
1333 // Scatter store vector + imm 64-bit element size
1334 def : InstRW<[CortexA510VSt<7>], (instregex "^SST1[BHW]_D_IMM$",
1335                                                 "^SST1D_IMM$")>;
1336
1337 // Scatter store, 32-bit scaled offset
1338 def : InstRW<[CortexA510VSt<8>],
1339              (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
1340
1341 // Scatter store, 32-bit unpacked unscaled offset
1342 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BHW]_D_[SU]XTW$",
1343                                                 "^SST1D_[SU]XTW$")>;
1344
1345 // Scatter store, 32-bit unpacked scaled offset
1346 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
1347                                                 "^SST1D_[SU]XTW_SCALED$")>;
1348
1349 // Scatter store, 32-bit unscaled offset
1350 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BH]_S_[SU]XTW$",
1351                                                 "^SST1W_[SU]XTW$")>;
1352
1353 // Scatter store, 64-bit scaled offset
1354 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[HW]_D_SCALED$",
1355                                                 "^SST1D_SCALED$")>;
1356
1357 // Scatter store, 64-bit unscaled offset
1358 def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BHW]_D$",
1359                                                 "^SST1D$")>;
1360
1361 // SVE Miscellaneous instructions
1362 // -----------------------------------------------------------------------------
1363
1364 // Read first fault register, unpredicated (bound to CortexA510UnitALU)
1365 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs RDFFR_P)>;
1366
1367 // Read first fault register, predicated (bound to CortexA510UnitALU0, unlike the unpredicated form)
1368 def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFR_PPz)>;
1369
1370 // Read first fault register and set flags (same write as the predicated read above)
1371 def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFRS_PPz)>;
1372
1373 // Set first fault register (SETFFR)
1374 // Write to first fault register (WRFFR)
1375 def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs SETFFR, WRFFR)>;
1376
1377 // SVE Cryptographic instructions
1378 // -----------------------------------------------------------------------------
1379
1380 // Crypto AES ops (AESD, AESE, AESMC, AESIMC)
1381 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]_ZZZ_B$",
1382                                            "^AESI?MC_ZZ_B$")>;
1383
1384 // Crypto SHA3 ops (BCAX, EOR3, XAR here; RAX1 below uses CortexA510MC_RC0Write on CortexA510UnitVMC instead)
1385 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$",
1386                                             "^XAR_ZZZI_[BHSD]$")>;
1387
1388 def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>;
1389
1390 // Crypto SM4 ops (SM4E, SM4EKEY)
1391 def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>;
1392
1393 }