llvm/lib/Target/AArch64/AArch64SchedA55.td

   1 //==- AArch64SchedCortexA55.td - ARM Cortex-A55 Scheduling Definitions -*- tablegen -*-=//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the machine model for the ARM Cortex-A55 processors. Note
  10 // that this schedule is currently used as the default for -mcpu=generic. As a
  11 // result, some of the modelling decisions made do not precisely model the
  12 // Cortex-A55, instead aiming to be a good compromise between different cpus.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 // ===---------------------------------------------------------------------===//
  17 // The following definitions describe the per-operand machine model.
  18 // This works with MachineScheduler. See MCSchedModel.h for details.
  19
  20 // Cortex-A55 machine model for scheduling and other instruction cost heuristics.
  21 def CortexA55Model : SchedMachineModel {
  22   let MicroOpBufferSize = 0;  // The Cortex-A55 is an in-order processor
  23   let IssueWidth = 2;         // It dual-issues under most circumstances
  24   let LoadLatency = 4;        // Cycles for loads to access the cache. The
  25                               // optimisation guide shows that most loads have
  26                               // a latency of 3, but some have a latency of 4
  27                               // or 5. Setting it to 4 looked to be a good trade-off.
  28   let MispredictPenalty = 8;  // A branch direction mispredict.
  29   let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
  30   let CompleteModel = 0;      // Not marked complete; covers instructions applicable to Cortex-A55.
  31
  32   list<Predicate> UnsupportedFeatures = [HasSVE, HasMTE]; // SVE and MTE are listed as unsupported by this model.
  33
  34   // FIXME: Remove when all errors have been fixed.
  35   let FullInstRWOverlapCheck = 0; // Full InstRW overlap check is switched off for this model.
  36 }
  37
  38 //===----------------------------------------------------------------------===//
  39 // Define each kind of processor resource and number available.
  40
  41 // Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
  42 // Cortex-A55 is in-order.
  43
  44 def CortexA55UnitALU    : ProcResource<2> { let BufferSize = 0; } // Int ALU
  45 def CortexA55UnitMAC    : ProcResource<1> { let BufferSize = 0; } // Int MAC, 64-bit wide
  46 def CortexA55UnitDiv    : ProcResource<1> { let BufferSize = 0; } // Int Division, not pipelined
  47 def CortexA55UnitLd     : ProcResource<1> { let BufferSize = 0; } // Load pipe
  48 def CortexA55UnitSt     : ProcResource<1> { let BufferSize = 0; } // Store pipe
  49 def CortexA55UnitB      : ProcResource<1> { let BufferSize = 0; } // Branch
  50
  51 // The FP DIV/SQRT instructions execute totally differently from the FP ALU
  52 // instructions, which can mostly be dual-issued; that's why for now we model
  53 // them as separate resources (two FP ALU units, one FP DIV/SQRT unit).
  54 def CortexA55UnitFPALU  : ProcResource<2> { let BufferSize = 0; } // FP ALU
  55 def CortexA55UnitFPMAC  : ProcResource<2> { let BufferSize = 0; } // FP MAC
  56 def CortexA55UnitFPDIV  : ProcResource<1> { let BufferSize = 0; } // FP Div/SQRT, 64/128
  57
  58 //===----------------------------------------------------------------------===//
  59 // Subtarget-specific SchedWrite types
  60
  61 let SchedModel = CortexA55Model in {
  62
  63 // These latencies are modeled without taking into account forwarding paths
  64 // (the software optimisation guide lists latencies taking into account
  65 // typical forwarding paths).
  66 def : WriteRes<WriteImm, [CortexA55UnitALU]> { let Latency = 3; }    // MOVN, MOVZ
  67 def : WriteRes<WriteI, [CortexA55UnitALU]> { let Latency = 3; }      // ALU
  68 def : WriteRes<WriteISReg, [CortexA55UnitALU]> { let Latency = 3; }  // ALU of Shifted-Reg
  69 def : WriteRes<WriteIEReg, [CortexA55UnitALU]> { let Latency = 3; }  // ALU of Extended-Reg
  70 def : WriteRes<WriteExtr, [CortexA55UnitALU]> { let Latency = 3; }   // EXTR from a reg pair
  71 def : WriteRes<WriteIS, [CortexA55UnitALU]> { let Latency = 3; }     // Shift/Scale
  72
  73 // MAC - both multiply widths use the single MAC unit with the same latency.
  74 def : WriteRes<WriteIM32, [CortexA55UnitMAC]> { let Latency = 4; }   // 32-bit Multiply
  75 def : WriteRes<WriteIM64, [CortexA55UnitMAC]> { let Latency = 4; }   // 64-bit Multiply
  76
  77 // Div - the divide unit is held for the whole 8-cycle latency.
  78 def : WriteRes<WriteID32, [CortexA55UnitDiv]> {
  79   let Latency = 8; let ReleaseAtCycles = [8];
  80 }
  81 def : WriteRes<WriteID64, [CortexA55UnitDiv]> {
  82   let Latency = 8; let ReleaseAtCycles = [8];
  83 }
  84
  85 // Load
  86 def : WriteRes<WriteLD, [CortexA55UnitLd]> { let Latency = 3; }
  87 def : WriteRes<WriteLDIdx, [CortexA55UnitLd]> { let Latency = 4; }
  88 def : WriteRes<WriteLDHi, [CortexA55UnitLd]> { let Latency = 5; }
  89
  90 // Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVLD
  91 //               below, choosing the median of 3 which makes the latency 6.
  92 // An extra cycle is needed to get the swizzling right.
  93 def : WriteRes<WriteVLD, [CortexA55UnitLd]> { let Latency = 6;
  94                                            let ReleaseAtCycles = [3]; }
  95 def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; } // VLD<N> below: latency is N + 3
  96 def CortexA55WriteVLD1SI : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; let SingleIssue = 1; } // VLD1 with SingleIssue set
  97 def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5;
  98                                                   let ReleaseAtCycles = [2]; } // VLD2..VLD8 hold the load pipe for N cycles
  99 def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 6;
 100                                                   let ReleaseAtCycles = [3]; }
 101 def CortexA55WriteVLD4 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 7;
 102                                                   let ReleaseAtCycles = [4]; }
 103 def CortexA55WriteVLD5 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 8;
 104                                                   let ReleaseAtCycles = [5]; }
 105 def CortexA55WriteVLD6 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 9;
 106                                                   let ReleaseAtCycles = [6]; }
 107 def CortexA55WriteVLD7 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 10;
 108                                                   let ReleaseAtCycles = [7]; }
 109 def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 11;
 110                                                   let ReleaseAtCycles = [8]; }
 111
 112 def CortexA55WriteLDP1 : SchedWriteRes<[]> { let Latency = 4; } // Latency only; no processor resource listed
 113 def CortexA55WriteLDP2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5; }
 114 def CortexA55WriteLDP4 : SchedWriteRes<[CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd]> { let Latency = 6; } // Load pipe is listed five times
 115
 116 // Pre/Post Indexing - Performed as part of address generation
 117 def : WriteRes<WriteAdr, []> { let Latency = 0; } // Zero latency, no resource
 118
 119 // Store - the plain store writes below are defined with RetireOOO = 1.
 120 let RetireOOO = 1 in {
 121 def : WriteRes<WriteST, [CortexA55UnitSt]> { let Latency = 1; }
 122 def : WriteRes<WriteSTP, [CortexA55UnitSt]> { let Latency = 1; }
 123 def : WriteRes<WriteSTIdx, [CortexA55UnitSt]> { let Latency = 1; }
 124 }
 125 def : WriteRes<WriteSTX, [CortexA55UnitSt]> { let Latency = 4; } // Defined outside the RetireOOO scope
 126
 127 // Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
 128 def : WriteRes<WriteVST, [CortexA55UnitSt]> { let Latency = 5;
 129                                           let ReleaseAtCycles = [2];}
 130 def CortexA55WriteVST1 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 4; }
 131 def CortexA55WriteVST2 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;
 132                                                   let ReleaseAtCycles = [2]; }
 133 def CortexA55WriteVST3 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 6;
 134                                                   let ReleaseAtCycles = [3]; }
 135 def CortexA55WriteVST4 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;
 136                                                   let ReleaseAtCycles = [4]; } // NOTE(review): latency 5 is below VST3's 6 - confirm intended
 137
 138 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } // Atomics are flagged Unsupported in this model
 139
 140 // Branch - these writes all use the branch unit and override no WriteRes fields.
 141 def : WriteRes<WriteBr, [CortexA55UnitB]>;
 142 def : WriteRes<WriteBrReg, [CortexA55UnitB]>;
 143 def : WriteRes<WriteSys, [CortexA55UnitB]>;
 144 def : WriteRes<WriteBarrier, [CortexA55UnitB]>;
 145 def : WriteRes<WriteHint, [CortexA55UnitB]>;
 146
 147 // FP ALU
 148 //   As the WriteF result is produced in F5 and it can mostly be forwarded
 149 //   to the consumer at F1, the effective latency is set to 4.
 150 def : WriteRes<WriteF, [CortexA55UnitFPALU]> { let Latency = 4; }
 151 def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; }
 152 def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; }
 153 def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; }
 154 def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; }
 155
 156 // NEON
 157 class CortexA55WriteVd<int n, ProcResourceKind res> : SchedWriteRes<[res]> { // Latency n, one use of res
 158   let Latency = n;
 159 }
 160 class CortexA55WriteVq<int n, ProcResourceKind res> : SchedWriteRes<[res, res]> { // Latency n, res listed twice
 161   let Latency = n;
 162   let BeginGroup = 1; // Unlike the Vd class, Vq writes also set BeginGroup
 163 }
 164 def CortexA55WriteDotScVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; // Name suffix _<n> is the latency argument
 165 def CortexA55WriteDotVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
 166 def CortexA55WriteDotVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
 167 def CortexA55WriteMlaLVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
 168 def CortexA55WriteMlaIxVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
 169 def CortexA55WriteMlaVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
 170 def CortexA55WriteMlaVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
 171 def CortexA55WriteAluVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
 172 def CortexA55WriteAluVd_3 : CortexA55WriteVd<3, CortexA55UnitFPALU>;
 173 def CortexA55WriteAluVq_3 : CortexA55WriteVq<3, CortexA55UnitFPALU>;
 174 def CortexA55WriteAluVd_2 : CortexA55WriteVd<2, CortexA55UnitFPALU>;
 175 def CortexA55WriteAluVq_2 : CortexA55WriteVq<2, CortexA55UnitFPALU>;
 176 def CortexA55WriteAluVd_1 : CortexA55WriteVd<1, CortexA55UnitFPALU>;
 177 def CortexA55WriteAluVq_1 : CortexA55WriteVq<1, CortexA55UnitFPALU>;
 178 def : SchedAlias<WriteVd, CortexA55WriteVd<4, CortexA55UnitFPALU>>; // Generic WriteVd/WriteVq alias to latency-4 FP ALU writes
 179 def : SchedAlias<WriteVq, CortexA55WriteVq<4, CortexA55UnitFPALU>>;
 180
 181 // FP ALU specific new schedwrite definitions
 182 def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;} // _F<n>: FP ALU write with latency n
 183 def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 3;}
 184 def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 4;}
 185 def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 5;}
 186
 187 // FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
 188 def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> { let Latency = 4; }
 189
 190 let RetireOOO = 1 in {
 191 def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> { let Latency = 22;
 192                                             let ReleaseAtCycles = [29]; } // NOTE(review): 29 exceeds Latency 22 and the DP value of 19 below - confirm
 193 def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; }
 194 def CortexA55WriteFDivHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
 195                                                      let ReleaseAtCycles = [5]; } // HP/SP/DP writes hold the unit for Latency - 3 cycles
 196 def CortexA55WriteFDivSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 13;
 197                                                      let ReleaseAtCycles = [10]; }
 198 def CortexA55WriteFDivDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
 199                                                      let ReleaseAtCycles = [19]; }
 200 def CortexA55WriteFSqrtHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
 201                                                       let ReleaseAtCycles = [5]; }
 202 def CortexA55WriteFSqrtSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 12;
 203                                                       let ReleaseAtCycles = [9]; }
 204 def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
 205                                                       let ReleaseAtCycles = [19]; }
 206 }
 207 //===----------------------------------------------------------------------===//
 208 // Subtarget-specific SchedRead types.
 209
 210 def : ReadAdvance<ReadVLD, 0>;
 211 def : ReadAdvance<ReadExtrHi, 1>;
 212 def : ReadAdvance<ReadAdrBase, 1>;
 213 def : ReadAdvance<ReadST, 1>;
 214
 215 // ALU - ALU input operands are generally needed in EX1. An operand produced
 216 //       in, say, EX2 can be forwarded for consumption to ALU in EX1, thereby
 217 //       allowing back-to-back ALU operations such as add. If an operand requires
 218 //       a shift, it will, however, be required in ISS stage.
 219 def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
 220                              WriteISReg, WriteIEReg,WriteIS,
 221                              WriteID32,WriteID64,
 222                              WriteIM32,WriteIM64]>;
 223 // Shifted operands get an advance of 1, non-shifted operands an advance of 2.
 224 def CortexA55ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
 225                                           WriteISReg, WriteIEReg,WriteIS,
 226                                           WriteID32,WriteID64,
 227                                           WriteIM32,WriteIM64]>;
 228 def CortexA55ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
 229                                              WriteISReg, WriteIEReg,WriteIS,
 230                                              WriteID32,WriteID64,
 231                                              WriteIM32,WriteIM64]>;
 232 def CortexA55ReadISReg : SchedReadVariant<[
 233         SchedVar<RegShiftedPred, [CortexA55ReadShifted]>,
 234         SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>; // NoSchedPred entry is the fallback
 235 def : SchedAlias<ReadISReg, CortexA55ReadISReg>;
 236
 237 def CortexA55ReadIEReg : SchedReadVariant<[
 238         SchedVar<RegExtendedPred, [CortexA55ReadShifted]>,
 239         SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>; // Extended operands reuse the shifted advance
 240 def : SchedAlias<ReadIEReg, CortexA55ReadIEReg>;
 241
 242 // MUL - ReadIM advances by 1, ReadIMA by 2, against the same producer list as above.
 243 def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
 244                               WriteISReg, WriteIEReg,WriteIS,
 245                               WriteID32,WriteID64,
 246                               WriteIM32,WriteIM64]>;
 247 def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
 248                                WriteISReg, WriteIEReg,WriteIS,
 249                                WriteID32,WriteID64,
 250                                WriteIM32,WriteIM64]>;
 251
 252 // Div
 253 def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
 254                               WriteISReg, WriteIEReg,WriteIS,
 255                               WriteID32,WriteID64,
 256                               WriteIM32,WriteIM64]>;
 257
 258 //===----------------------------------------------------------------------===//
 259 // Subtarget-specific InstRWs.
 260
 261 //---
 262 // Miscellaneous
 263 //---
 264 def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?Wi")>; // W-register pairs use the SingleIssue write
 265 def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPSi")>;
 266 def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)i")>;
 267 def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQi")>;
 268 def : InstRW<[WriteAdr, CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W(pre|post)")>; // pre/post forms add WriteAdr first
 269 def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS(pre|post)")>;
 270 def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
 271 def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ(pre|post)")>;
 272 def : InstRW<[WriteI], (instrs COPY)>; // COPY uses the generic ALU write
 273 //---
 274 // Vector Loads - 64-bit per cycle
 275 //---
 276 //   1-element structures
 277 def : InstRW<[CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;                // single element
 278 def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
 279 def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
 280 def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
 281 def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures
 282 def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
 283 def : InstRW<[CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
 284 def : InstRW<[CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 285 def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 286 def : InstRW<[CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 287 // _POST forms: same VLD write as the matching form above, preceded by WriteAdr.
 288 def : InstRW<[WriteAdr, CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
 289 def : InstRW<[WriteAdr, CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 290 def : InstRW<[WriteAdr, CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
 291 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
 292 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
 293 def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
 294 def : InstRW<[WriteAdr, CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
 295 def : InstRW<[WriteAdr, CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
 296 def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
 297 def : InstRW<[WriteAdr, CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
 298
 299 //    2-element structures
 300 def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
 301 def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 302 def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
 303 def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 304 // _POST forms only: the suffix is mandatory so these do not also match the plain forms above.
 305 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)_POST$")>;
 306 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 307 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
 308 def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
 309
 310 //    3-element structures
 311 def : InstRW<[CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
 312 def : InstRW<[CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 313 def : InstRW<[CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
 314 def : InstRW<[CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 315 // _POST forms: same VLD write as the matching form above, preceded by WriteAdr.
 316 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
 317 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 318 def : InstRW<[WriteAdr, CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
 319 def : InstRW<[WriteAdr, CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
 320
 321 //    4-element structures
 322 def : InstRW<[CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
 323 def : InstRW<[CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs.
 324 def : InstRW<[CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
 325 def : InstRW<[CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 326 // _POST forms: same VLD write as the matching form above, preceded by WriteAdr.
 327 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
 328 def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 329 def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
 330 def : InstRW<[WriteAdr, CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
 331
 332 //---
 333 // Vector Stores
 334 //---
 335 def : InstRW<[CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)$")>; // 1-element structures
 336 def : InstRW<[CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 337 def : InstRW<[CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 338 def : InstRW<[CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 339 def : InstRW<[CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 340 def : InstRW<[WriteAdr, CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>; // _POST forms add WriteAdr first
 341 def : InstRW<[WriteAdr, CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 342 def : InstRW<[WriteAdr, CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 343 def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 344 def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 345 //    2-element structures
 346 def : InstRW<[CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
 347 def : InstRW<[CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
 348 def : InstRW<[CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
 349 def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
 350 def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
 351 def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 352 //    3-element structures (plain and _POST forms use the same arrangement list)
 353 def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
 354 def : InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 355 def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
 356 def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 357 //    4-element structures
 358 def : InstRW<[CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
 359 def : InstRW<[CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 360 def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
 361 def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 362
 363 //---
 364 // Floating Point Conversions, MAC, DIV, SQRT
 365 //---
 366 def : InstRW<[CortexA55WriteFPALU_F2], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
 367 def : InstRW<[CortexA55WriteFPALU_F2], (instregex "^XTN")>;
 368 def : InstRW<[CortexA55WriteFPALU_F3], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
 369 def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
 370
 371 def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
 372 def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
 373 def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
 374
 375 def : InstRW<[CortexA55WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
 376 def : InstRW<[CortexA55WriteFMAC], (instregex "^FML(A|S).*")>;
 377 def : InstRW<[CortexA55WriteFDivHP], (instrs FDIVHrr)>; // scalar divides are named explicitly
 378 def : InstRW<[CortexA55WriteFDivSP], (instrs FDIVSrr)>;
 379 def : InstRW<[CortexA55WriteFDivDP], (instrs FDIVDrr)>;
 380 def : InstRW<[CortexA55WriteFDivHP], (instregex "^FDIVv.*16$")>; // FDIVv* opcodes are split by the trailing 16/32/64
 381 def : InstRW<[CortexA55WriteFDivSP], (instregex "^FDIVv.*32$")>;
 382 def : InstRW<[CortexA55WriteFDivDP], (instregex "^FDIVv.*64$")>;
 383 def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; // matches any opcode name containing SQRT and ending in 16
 384 def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
 385 def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
 386
 387 // 4.15. Advanced SIMD integer instructions
 388 // ASIMD absolute diff
 389 def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
 390 def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
 391 // ASIMD absolute diff accum
 392 def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ABAL?v")>;
 393 // ASIMD absolute diff long
 394 def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDLv")>;
 395 // ASIMD arith #1
 396 def : InstRW<[CortexA55WriteAluVd_2], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)",
 397   "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>;
 398 def : InstRW<[CortexA55WriteAluVq_2], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)",
 399   "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>;
 400 // ASIMD arith #2
 401 def : InstRW<[CortexA55WriteAluVd_3], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
 402   "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
 403   "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$",
 404   "ADDPv(2i32|4i16|8i8)$")>;
 405 def : InstRW<[CortexA55WriteAluVq_3], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
 406   "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
 407   "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$",
 408   "ADDPv(16i8|2i64|4i32|8i16)$")>;
 409 // ASIMD arith #3
 410 def : InstRW<[CortexA55WriteAluVq_3], (instregex  "SADDLv", "UADDLv", "SADDWv",
 411   "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>;
 412 // ASIMD arith #5
 413 def : InstRW<[CortexA55WriteAluVq_4], (instregex "RADDHNv", "RSUBHNv")>;
 414 // ASIMD arith, reduce
 415 def : InstRW<[CortexA55WriteAluVq_3], (instregex  "ADDVv", "SADDLVv", "UADDLVv")>;
 416 // ASIMD compare #1
 417 def : InstRW<[CortexA55WriteAluVd_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
 418 def : InstRW<[CortexA55WriteAluVq_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
 419 // ASIMD compare #2
 420 def : InstRW<[CortexA55WriteAluVd_3], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
 421 def : InstRW<[CortexA55WriteAluVq_3], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
 422 // ASIMD logical #1
 423 def : InstRW<[CortexA55WriteAluVd_1], (instregex "(AND|EOR|NOT|ORN)v8i8",
 424   "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
 425 def : InstRW<[CortexA55WriteAluVq_1], (instregex "(AND|EOR|NOT|ORN)v16i8",
 426   "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
 427 // ASIMD max/min, basic (Vd write for the 2i32/4i16/8i8 forms, Vq write for 16i8/4i32/8i16)
 428 def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
 429 def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>;
 430 // ASIMD max/min, reduce
 431 def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU](MAX|MIN)Vv")>;
 432 // ASIMD multiply, by element
 433 def : InstRW<[CortexA55WriteAluVq_4], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
 434   "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
 435 // ASIMD multiply
 436 def : InstRW<[CortexA55WriteAluVd_3], (instrs PMULv8i8)>;
 437 def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULv16i8)>;
 438 // ASIMD multiply accumulate
 439 def : InstRW<[CortexA55WriteMlaVd_4], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
 440 def : InstRW<[CortexA55WriteMlaVq_4], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
 441 def : InstRW<[CortexA55WriteMlaIxVq_4], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
 442 // ASIMD multiply accumulate half
 443 def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQRDML[AS]H[vi]")>;
 444 // ASIMD multiply accumulate long
 445 def : InstRW<[CortexA55WriteMlaLVq_4], (instregex "[SU]ML[AS]Lv")>;
 446 // ASIMD multiply accumulate long #2
 447 def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQDML[AS]L[iv]")>;
 448 // ASIMD dot product
 449 def : InstRW<[CortexA55WriteDotVd_4], (instregex "[SU]DOTv8i8")>;
 450 def : InstRW<[CortexA55WriteDotVq_4], (instregex "[SU]DOTv16i8")>;
 451 // ASIMD dot product, by scalar
 452 def : InstRW<[CortexA55WriteDotScVq_4], (instregex "[SU]DOTlanev")>;
 453 // ASIMD multiply long
 454 def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
 455 // ASIMD polynomial (8x8) multiply long
 456 def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULLv8i8, PMULLv16i8)>;
 457 // ASIMD pairwise add and accumulate
 458 def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ADALPv")>;
 459 // ASIMD shift accumulate
 460 def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
 461 def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
 462 // ASIMD shift accumulate #2
 463 def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]RSRA[vd]")>;
 464 // ASIMD shift by immed
 465 def : InstRW<[CortexA55WriteAluVd_2], (instregex "SHLd$", "SHLv",
 466   "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
 467 // ASIMD shift by immed, long
 468 // SXTL and UXTL are aliases for SHLL
 469 def : InstRW<[CortexA55WriteAluVq_2], (instregex "[US]?SHLLv")>;
 470 // ASIMD shift by immed #2
 471 def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
 472   "RSHRNv(2i32|4i16|8i8)")>;
 473 def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHRv(16i8|2i64|4i32|8i16)",
 474   "RSHRNv(16i8|4i32|8i16)")>;
 475 // ASIMD shift by register
 476 def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
 477 def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
 478 // ASIMD shift by register #2
 479 def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
 480 def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;
 481
 482 }