llvm/lib/Target/AArch64/AArch64SchedCyclone.td

   1 //=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the machine model for AArch64 Cyclone to support
  10 // instruction scheduling and other instruction cost heuristics.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
   14 def CycloneModel : SchedMachineModel { // Top-level machine model for Cyclone.
   15   let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
   16   let MicroOpBufferSize = 192; // Based on the reorder buffer.
   17   let LoadLatency = 4; // Optimistic load latency.
   18   let MispredictPenalty = 16; // 14-19 cycles are typical.
   19   let CompleteModel = 1; // The model is declared complete.
   20
        // Instructions guarded by the SVE, PA and SME predicate lists are declared
        // unsupported by this model.
   21   list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
   22                                                     PAUnsupported.F,
   23                                                     SMEUnsupported.F);
   24 }
  25
  26 //===----------------------------------------------------------------------===//
  27 // Define each kind of processor resource and number available on Cyclone.
  28
   29 // 4 integer pipes: I[0..3]. Parent (Super) of the branch and shifter groups.
   30 def CyUnitI : ProcResource<4> {
   31   let BufferSize = 48; // Micro-op buffer size for the whole integer group.
   32 }
  33
   34 // 2 branch units: I[0..1]
   35 def CyUnitB : ProcResource<2> {
   36   let Super  = CyUnitI; // A subset of the integer pipes.
   37   let BufferSize = 24;
   38 }
  39
   40 // 1 indirect-branch unit: I[0]
   41 def CyUnitBR : ProcResource<1> {
   42   let Super  = CyUnitB; // A subset of the branch units; no BufferSize is set.
   43 }
  44
   45 // 2 shifter pipes: I[2..3]
   46 // When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
   47 def CyUnitIS : ProcResource<2> {
   48   let Super = CyUnitI; // This is what ties a shifter use to an integer pipe.
   49   let BufferSize = 24;
   50 }
  51
   52 // 1 mul pipe: I[0]
   53 def CyUnitIM : ProcResource<1> {
   54   let Super = CyUnitBR; // Same pipe (I[0]) as the indirect-branch unit.
   55   let BufferSize = 32;
   56 }
  57
   58 // 1 div pipe: I[1]
   59 def CyUnitID : ProcResource<1> {
   60   let Super = CyUnitB; // NOTE(review): parent is the 2-wide I[0..1] group; the
                             // I[1] placement is stated only by the comment above.
   61   let BufferSize = 16;
   62 }
  63
   64 // 1 integer division unit. This is driven by the ID pipe, but only
   65 // consumes the pipe for one cycle at issue and another cycle at writeback.
   66 def CyUnitIntDiv : ProcResource<1>; // No Super: a separate resource from the pipes.
  67
   68 // 2 ld/st pipes. No Super: independent of the integer and vector groups.
   69 def CyUnitLS : ProcResource<2> {
   70   let BufferSize = 28;
   71 }
  72
   73 // 3 fp/vector pipes: V[0..2]. Parent (Super) of the VM and VD groups.
   74 def CyUnitV : ProcResource<3> {
   75   let BufferSize = 48;
   76 }
   77 // 2 fp/vector arithmetic and multiply pipes: V[0-1]
   78 def CyUnitVM : ProcResource<2> {
   79   let Super = CyUnitV; // A subset of the vector pipes.
   80   let BufferSize = 32;
   81 }
   82 // 1 fp/vector division/sqrt pipe: V[2]
   83 def CyUnitVD : ProcResource<1> {
   84   let Super = CyUnitV; // A subset of the vector pipes.
   85   let BufferSize = 16;
   86 }
   87 // 1 fp compare pipe: V[0]
   88 def CyUnitVC : ProcResource<1> {
   89   let Super = CyUnitVM; // A subset of the arithmetic/multiply pipes.
   90   let BufferSize = 16;
   91 }
  92
   93 // 2 fp division/square-root units.  These are driven by the VD pipe,
   94 // but only consume the pipe for one cycle at issue and a cycle at writeback.
   95 def CyUnitFloatDiv : ProcResource<2>; // No Super: separate from the vector pipes.
  96
  97 //===----------------------------------------------------------------------===//
  98 // Define scheduler read/write resources and latency on Cyclone.
  99 // This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
 100
 101 let SchedModel = CycloneModel in {
 102
 103 //---
 104 // 7.8.1. Moves
 105 //---
 106
  107 // A single nop micro-op (uX): no processor resources and zero latency.
  108 def WriteX : SchedWriteRes<[]> { let Latency = 0; }
 109
  110 // Move zero is a register rename (to machine register zero).
  111 // The move is replaced by a single nop micro-op.
  112 // MOVZ Rd, #0
  113 // AND Rd, Rzr, #imm
  114 def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
  115 def WriteImmZ  : SchedWriteVariant<[
  116                    SchedVar<WriteZPred, [WriteX]>, // Zeroing form: nop micro-op.
  117                    SchedVar<NoSchedPred, [WriteImm]>]>; // Otherwise: WriteImm.
  118 def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
 119
  120 // Move GPR is a register rename and single nop micro-op.
  121 // ORR Xd, XZR, Xm
  122 // ADD Xd, Xn, #0
  123 def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
  124 def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
  125 def WriteMov      : SchedWriteVariant<[
  126                       SchedVar<WriteIMovPred, [WriteX]>, // GPR copy: nop micro-op.
  127                       SchedVar<WriteVMovPred, [WriteX]>, // FPR copy: nop micro-op.
  128                       SchedVar<NoSchedPred,   [WriteI]>]>; // Otherwise: integer ALU.
      // NOTE(review): the example above shows ADD with an immediate (#0), but the
      // list below names the register-register opcode ADDXrr -- confirm intent.
  129 def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
 130
  131 // Move non-zero immediate is an integer ALU op.
  132 // MOVN,MOVZ,MOVK
  133 def : WriteRes<WriteImm, [CyUnitI]>; // Any integer pipe; no Latency override.
 134
 135 //---
 136 // 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
 137 //              Shifts and Bitfield Operations
 138 //---
 139
  140 // ADR,ADRP
  141 // ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
  142 // ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
  143 // ADC(S),SBC(S)
  144 // Aliases: CMN, CMP, TST
  145 //
  146 // Conditional operations.
  147 // CCMNi,CCMPi,CCMNr,CCMPr,
  148 // CSEL,CSINC,CSINV,CSNEG
  149 //
  150 // Bit counting and reversal operations.
  151 // CLS,CLZ,RBIT,REV,REV16,REV32
  152 def : WriteRes<WriteI, [CyUnitI]>; // Any integer pipe; no Latency override.
 153
  154 // ADD with shifted register operand is a single micro-op that
  155 // consumes a shift pipeline for two cycles.
  156 // ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
  157 // EXAMPLE: ADDrs Xn, Xm LSL #imm
  158 def : WriteRes<WriteISReg, [CyUnitIS]> {
  159   let Latency = 2;
  160   let ResourceCycles = [2]; // The shifter pipe is held for both cycles.
  161 }
  162
  163 // ADD with extended register operand is the same as shifted reg operand.
  164 // ADD(S)re,SUB(S)re
  165 // EXAMPLE: ADDXre Xn, Xm, UXTB #1
  166 def : WriteRes<WriteIEReg, [CyUnitIS]> {
  167   let Latency = 2;
  168   let ResourceCycles = [2]; // Same cost as WriteISReg above.
  169 }
 170
  171 // Variable shift and bitfield operations.
  172 // ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
  173 def : WriteRes<WriteIS, [CyUnitIS]>; // One shifter pipe; no Latency override.
  174
  175 // EXTR Shifts a pair of registers and requires two micro-ops.
  176 // The second micro-op is delayed, as modeled by ReadExtrHi.
  177 // EXTR Xn, Xm, #imm
  178 def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> { // One shifter entry per micro-op.
  179   let Latency = 2;
  180   let NumMicroOps = 2;
  181 }
  182
  183 // EXTR's first register read is delayed by one cycle, effectively
  184 // shortening its writer's latency.
  185 // EXTR Xn, Xm, #imm
  186 def : ReadAdvance<ReadExtrHi, 1>; // 1 = the one-cycle delay described above.
 187
 188 //---
 189 // 7.8.6. Multiplies
 190 //---
 191
  192 // MUL/MNEG are aliases for MADD/MSUB.
  193 // MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
  194 def : WriteRes<WriteIM32, [CyUnitIM]> { // The single multiply pipe.
  195   let Latency = 4;
  196 }
  197 // MADDX,MSUBX,SMULH,UMULH
  198 def : WriteRes<WriteIM64, [CyUnitIM]> { // Same pipe, one cycle longer.
  199   let Latency = 5;
  200 }
 201
 202 //---
 203 // 7.8.7. Divide
 204 //---
 205
  206 // 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
  207 // The ID pipe is consumed for 2 cycles: issue and writeback.
  208 // SDIVW,UDIVW
  209 def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
  210   let Latency = 10;
  211   let ResourceCycles = [2, 10]; // 2 on CyUnitID, 10 on CyUnitIntDiv.
  212 }
  213 // 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
  214 // The ID pipe is consumed for 2 cycles: issue and writeback.
  215 // SDIVX,UDIVX
  216 def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
  217   let Latency = 13;
  218   let ResourceCycles = [2, 13]; // 2 on CyUnitID, 13 on CyUnitIntDiv.
  219 }
 220
 221 //---
 222 // 7.8.8,7.8.10. Load/Store, single element
 223 //---
 224
  225 // Integer loads take 4 cycles and use one LS unit for one cycle.
  226 def : WriteRes<WriteLD, [CyUnitLS]> {
  227   let Latency = 4; // Same value as the model's LoadLatency.
  228 }
  229
  230 // Store-load forwarding is 4 cycles.
  231 //
  232 // Note: The store-exclusive sequence incorporates this
  233 // latency. However, general heuristics should not model the
  234 // dependence between a store and subsequent may-alias load because
  235 // hardware speculation works.
  236 def : WriteRes<WriteST, [CyUnitLS]> {
  237   let Latency = 4; // The store-load forwarding latency described above.
  238 }
 239
  240 // Load from base address plus an optionally scaled register offset.
  241 // Rt latency is the sum of the WriteIS and WriteLD latencies.
  242 // EXAMPLE: LDR Xn, Xm [, lsl 3]
  243 def CyWriteLDIdx : SchedWriteVariant<[
  244   SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
  245   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
  246 def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
  247
  248 // Store counterpart of CyWriteLDIdx: a scaled index adds a WriteIS.
  249 // EXAMPLE: STR Xn, Xm [, lsl 3]
  249 def CyWriteSTIdx : SchedWriteVariant<[
  250   SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
  251   SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
  252 def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
 253
  254 // Read the (unshifted) base register Xn in the second micro-op one cycle later.
  255 // EXAMPLE: LDR Xn, Xm [, lsl 3]
  256 def ReadBaseRS : SchedReadAdvance<1>; // 1 = the one-cycle-later read.
  257 def CyReadAdrBase : SchedReadVariant<[
  258   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
  259   SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
  260 def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
 261
 262 //---
 263 // 7.8.9,7.8.11. Load/Store, paired
 264 //---
 265
  266 // Address pre/post increment is a simple ALU op with one cycle latency.
  267 def : WriteRes<WriteAdr, [CyUnitI]>; // Any integer pipe; no Latency override.
  268
  269 // LDP high register write is fused with the load, but a nop micro-op remains.
  270 def : WriteRes<WriteLDHi, []> { // No resources; same latency as WriteLD.
  271   let Latency = 4;
  272 }
  273
  274 // STP is a vector op and store, except for QQ, which is just two stores.
  275 def : SchedAlias<WriteSTP, WriteVSTShuffle>;
  276 def : InstRW<[WriteST, WriteST], (instrs STPQi)>; // The QQ exception: two stores.
 277
 278 //---
 279 // 7.8.13. Branches
 280 //---
 281
  282 // Branches take a single micro-op.
  283 // The misprediction penalty is defined as a SchedMachineModel property.
  284 def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
  285 def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;} // Indirect-branch unit.
  286
  287 //---
  288 // 7.8.14. Never-issued Instructions, Barrier and Hint Operations
  289 //---
  290
  291 // NOP,SEV,SEVL,WFE,WFI,YIELD
  292 def : WriteRes<WriteHint, []> {let Latency = 0;} // No resources are consumed.
  293 // ISB
  294 def : InstRW<[WriteI], (instrs ISB)>; // Modeled as an ordinary integer ALU op.
  295 // CLREX,DMB,DSB
  296 def : WriteRes<WriteBarrier, [CyUnitLS]>; // Occupies a load/store pipe.
  297
  298 // System instructions get an invalid latency because the latency of
  299 // other operations across them is meaningless.
  300 def : WriteRes<WriteSys, []> {let Latency = -1;} // -1 is the invalid latency.
 301
 302 //===----------------------------------------------------------------------===//
 303 // 7.9 Vector Unit Instructions
 304
  305 // Simple vector operations take 2 cycles.
  306 def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
  307
  308 // Define some longer latency vector op types for Cyclone.
  309 // Each uses any vector pipe; the numeric suffix is the latency in cycles.
  309 def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
  310 def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
  311 def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
  312 def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
  313
  314 // Simple floating-point operations take 2 cycles.
  315 def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
 316
 317 //---
 318 // 7.9.1 Vector Moves
 319 //---
 320
 321 // TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
 322 // generates expensive int-float conversion instead:
 323 // FMOVDi Dd, #0.0
 324 // FMOVv2f64ns Vd.2d, #0.0
 325
  326 // FMOVSi,FMOVDi: same cost as a simple vector op (any vector pipe, 2 cycles).
  327 def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
 328
 329 // MOVI,MVNI are WriteV
 330 // FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
 331
  332 // Move FPR is a register rename and single nop micro-op.
  333 // ORR.16b Vd,Vn,Vn
  334 // COPY is handled above in the WriteMov Variant.
  335 def WriteVMov    : SchedWriteVariant<[
  336                      SchedVar<WriteVMovPred, [WriteX]>, // FPR copy: nop micro-op.
  337                      SchedVar<NoSchedPred,   [WriteV]>]>; // Otherwise: vector op.
  338 def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
 339
 340 // FMOVSr,FMOVDr are WriteF.
 341
 342 // MOV V,V is a WriteV.
 343
 344 // CPY D,V[x] is a WriteV
 345
 346 // INS V[x],V[y] is a WriteV.
 347
  348 // FMOVWSr,FMOVXDr,FMOVXDHighr
  349 def : WriteRes<WriteFCopy, [CyUnitLS]> { // Uses a load/store pipe.
  350   let Latency = 5; // Same latency as WriteVLD.
  351 }
  352
  353 // FMOVSWr,FMOVDXr,FMOVDXHighr: scheduled like an integer load.
  354 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
 355
  356 // INS V[x],R: a vector-load-like step followed by a simple vector op.
  357 def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
  358 def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
  359
  360 // SMOV,UMOV R,V[x]: an integer-load-like step followed by an integer ALU op.
  361 def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
  362 def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
  363
  364 // DUP V,R: same sequence as INS V[x],R.
  365 def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
 366
 367 // DUP V,V[x] is a WriteV.
 368
 369 //---
 370 // 7.9.2 Integer Arithmetic, Logical, and Comparisons
 371 //---
 372
  373 // BIC,ORR V,#imm are WriteV
  374
  375 def : InstRW<[CyWriteV3], (instregex "ABSv")>;
  376
  377 // MVN,NEG,NOT are WriteV
  378
  379 def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
  380
  381 // ADDP is a WriteV.
  382 // SADDLP/UADDLP get their own named write so CyReadVAccum can list it.
  382 def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
  383 def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
  384
  385 def : InstRW<[CyWriteV3], // Across-lanes add/max/min reductions.
  386              (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
  387
  388 def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
 389
 390 // ADD,SUB are WriteV
 391
  392 // Defined ahead of its InstRW so CyReadVAccum can reference it.
  393 def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
  394
  395 // Add/Diff and accumulate uses the vector multiply unit.
  396 def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
  397 def CyReadVAccum  : SchedReadAdvance<1, // Advance of 1 cycle for the writes listed.
  398                     [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
  399
  400 def : InstRW<[CyWriteVAccum, CyReadVAccum],
  401              (instregex "SADALP","UADALP")>;
  402
  403 def : InstRW<[CyWriteVAccum, CyReadVAccum],
  404              (instregex "SABAv","UABAv","SABALv","UABALv")>;
  405
  406 def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
  407
  408 def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
  409
  410 def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
 411
 412 // WriteV includes:
 413 // AND,BIC,CMTST,EOR,ORN,ORR
 414 // ADDP
 415 // SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
 416 // SADDL,SSUBL,UADDL,USUBL
 417 // SADDW,SSUBW,UADDW,USUBW
 418
  419 def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", // Integer compares.
  420                                      "CMLEv","CMLTv",
  421                                      "CMHIv","CMHSv")>;
  422
  423 def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", // Max/min.
  424                                      "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
  425
  426 def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", // Absolute difference.
  427                                        "SABDLv","UABDLv")>;
 428
 429 //---
 430 // 7.9.3 Floating Point Arithmetic and Comparisons
 431 //---
 432
 433 // FABS,FNEG are WriteF
 434
  435 def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; // 32-bit pairwise add: 4 cycles.
  436 def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; // 64-bit pairwise add: 5 cycles.
  437
  438 def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
  439                                      "FMINPv2i","FMINNMPv2i")>;
  440
  441 def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
  442
  443 // Single-precision add/sub/abs-diff forms: 4 cycles.
  443 def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
  444                                   FSUBSrr,FSUBv2f32,FSUBv4f32,
  445                                   FADDPv2f32,FADDPv4f32,
  446                                   FABD32,FABDv2f32,FABDv4f32)>;
  447 // Double-precision add/sub/abs-diff forms: 5 cycles.
  447 def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
  448                                   FSUBDrr,FSUBv2f64,
  449                                   FADDPv2f64,
  450                                   FABD64,FABDv2f64)>;
  451
  452 def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
  453
  454 def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
  455                                      "FMAXS","FMAXD","FMAXv",
  456                                      "FMINS","FMIND","FMINv",
  457                                      "FMAXNMS","FMAXNMD","FMAXNMv",
  458                                      "FMINNMS","FMINNMD","FMINNMv",
  459                                      "FMAXPv2f","FMAXPv4f",
  460                                      "FMINPv2f","FMINPv4f",
  461                                      "FMAXNMPv2f","FMAXNMPv4f",
  462                                      "FMINNMPv2f","FMINNMPv4f")>;
  463
  464 // FCMP,FCMPE,FCCMP,FCCMPE: the dedicated fp compare pipe.
  465 def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
 466
 467 // FCSEL is a WriteF.
 468
 469 //---
 470 // 7.9.4 Shifts and Bitfield Operations
 471 //---
 472
 473 // SHL is a WriteV
 474
  475 def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} // Plain right shift.
  476 def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
  477
  478 def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} // Rounding shift.
  479 def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
  480
  481 // Shift and accumulate uses the vector multiply unit.
  482 def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
  483 def CyReadVShiftAcc  : SchedReadAdvance<1, // Advance of 1 cycle for the writes listed.
  484                         [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
  485 def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
  486              (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
  487
  488 // SSHL,USHL are WriteV.
  489
  490 def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
  491
  492 // SQSHL,SQSHLU,UQSHL are WriteV.
  493
  494 def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
 495
 496 // WriteV includes:
 497 // SHLL,SSHLL,USHLL
 498 // SLI,SRI
 499 // BIF,BIT,BSL,BSP
 500 // EXT
 501 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
 502 // XTN2
 503
  504 def : InstRW<[CyWriteV4], // Narrowing shifts and saturating narrows: 4 cycles.
  505              (instregex "RSHRNv","SHRNv",
  506                         "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
  507                         "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
 508
 509 //---
 510 // 7.9.5 Multiplication
 511 //---
 512
  513 def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} // Integer vector mul.
  514 def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
  515                              "SQDMULLv","SQDMULHv","SQRDMULHv")>;
  516
  517 // FMUL,FMULX,FNMUL default to WriteFMul.
  518 def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
  519
  520 def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} // 64-bit FP mul.
  521 def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
  522                                FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
  523
  524 def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
  525 def : InstRW<[CyWriteVMul, CyReadVMulAcc], // Integer multiply-accumulate forms.
  526              (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
  527               "SQDMLAL","SQDMLSL")>;
  528
  529 def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}  // Used by the S list.
  530 def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} // Used by the D list.
  531 def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; // 4-cycle advance after CyWriteSMul.
  532 def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; // 5-cycle advance after CyWriteDMul.
  533
  534 def : InstRW<[CyWriteSMul, CyReadSMul],
  535              (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
  536               FMLAv2f32,FMLAv4f32,
  537               FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
  538 def : InstRW<[CyWriteDMul, CyReadDMul],
  539              (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
  540               FMLAv2f64,FMLAv2i64_indexed,
  541               FMLSv2f64,FMLSv2i64_indexed)>;
  542
  543 def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } // Uses the VD pipe.
  544 def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
 545
 546 //---
 547 // 7.9.6 Divide and Square Root
 548 //---
 549
  550 // FDIV,FSQRT
  551 // TODO: Add 64-bit variant with 19 cycle latency.
  552 // TODO: Specialize FSQRT for longer latency.
  553 def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
  554   let Latency = 17;
  555   let ResourceCycles = [2, 17]; // 2 on CyUnitVD, 17 on CyUnitFloatDiv.
  556 }
  557
  558 def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
  559
  560 def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
  561 def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
  562
  563 def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } // Same as CyWriteSMul.
  564 def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } // Same as CyWriteDMul.
  565 def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
  566 def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
 567
 568 //---
 569 // 7.9.7 Integer-FP Conversions
 570 //---
 571
  572 // FCVT lengthen f16/s32: scheduled as a simple vector op.
  573 def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
  574
  575 // FCVT,FCVTN,FCVTXN
  576 // SCVTF,UCVTF V,V
  577 // FRINT(AIMNPXZ) V,V
  578 def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;} // Any vector pipe, 4 cycles.
 579
 580 // SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
 581 def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
 582 def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
 583
 584 // FCVT Rd, S/D = V6+LD4: 10 cycles
 585 def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
 586 def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
 587
 588 // FCVTL is a WriteV
 589
 590 //---
 591 // 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
 592 //---
 593
  594 def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} // VD pipe, 2 cycles.
  595 def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
  596                                        AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
  597                                        SHA1SU0rrr)>;
  598
  599 def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} // VD pipe, 3 cycles.
  600 def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
  601
  602 def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} // VD pipe, 6 cycles.
  603 def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
  604                                        SHA256Hrrr,SHA256H2rrr)>;
 605
  606 // TRN,UZP,ZIP are WriteV.
 607
 608 // TBL,TBX are WriteV.
 609
 610 //---
 611 // 7.9.11-7.9.14 Load/Store, single element and paired
 612 //---
 613
  614 // Loading into the vector unit takes 5 cycles vs 4 for integer loads.
  615 def : WriteRes<WriteVLD, [CyUnitLS]> {
  616   let Latency = 5; // One cycle more than WriteLD.
  617 }
  618
  619 // Store-load forwarding is 4 cycles.
  620 def : WriteRes<WriteVST, [CyUnitLS]> {
  621   let Latency = 4; // Same as the integer WriteST.
  622 }
 623
 624 // WriteVLDPair/VSTPair sequences are expanded by the target description.
 625
 626 //---
 627 // 7.9.15 Load, element operations
 628 //---
 629
  630 // Only the first WriteVLD and the WriteAdr for writeback match def operands.
  631 // Subsequent WriteVLDs only consume resources. Since all loaded values have
  632 // the same latency, this is acceptable.
 633
  634 // Vd is read 5 cycles after issuing the vector load (the WriteVLD latency).
  635 def : ReadAdvance<ReadVLD, 5>;
 636
  637 def : InstRW<[WriteVLD], // LD1, one register, any arrangement.
  638              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
  639 def : InstRW<[WriteVLD, WriteAdr], // _POST forms add WriteAdr for the writeback.
  640              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
  641
  642 // Register writes from the load's high half are fused micro-ops.
  643 def : InstRW<[WriteVLD], // LD1, two registers, 64-bit arrangements.
  644              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
  645 def : InstRW<[WriteVLD, WriteAdr],
  646              (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
  647 def : InstRW<[WriteVLD, WriteVLD], // LD1, two registers, 128-bit arrangements.
  648              (instregex "LD1Twov(16b|8h|4s|2d)$")>;
  649 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
  650              (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
  651
  652 def : InstRW<[WriteVLD, WriteVLD], // LD1, three registers, 64-bit arrangements.
  653              (instregex "LD1Threev(8b|4h|2s|1d)$")>;
  654 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
  655              (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
  656 def : InstRW<[WriteVLD, WriteVLD, WriteVLD], // Three registers, 128-bit.
  657              (instregex "LD1Threev(16b|8h|4s|2d)$")>;
  658 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
  659              (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
  660
  661 def : InstRW<[WriteVLD, WriteVLD], // LD1, four registers, 64-bit arrangements.
  662              (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
  663 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
  664              (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
  665 def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], // Four registers, 128-bit.
  666              (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
  667 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
  668              (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
 669
  670 def : InstRW<[WriteVLDShuffle, ReadVLD], // LD1 to one lane: Vd is also read.
  671              (instregex "LD1i(8|16|32)$")>;
  672 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
  673              (instregex "LD1i(8|16|32)_POST")>;
  674
  675 def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
  676 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
  677
  678 def : InstRW<[WriteVLDShuffle], // LD1R (replicate): no ReadVLD operand is listed.
  679              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
  680 def : InstRW<[WriteVLDShuffle, WriteAdr],
  681              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 682
  683 def : InstRW<[WriteVLDShuffle, WriteV], // LD2, 64-bit arrangements.
  684              (instregex "LD2Twov(8b|4h|2s)$")>;
  685 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], // _POST adds WriteAdr (writeback).
  686              (instregex "LD2Twov(8b|4h|2s)_POST$")>;
  687 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], // LD2, 128-bit arrangements.
  688              (instregex "LD2Twov(16b|8h|4s|2d)$")>;
  689 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
  690              (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
  691
  692 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], // LD2 to one lane: Vd is also read.
  693              (instregex "LD2i(8|16|32)$")>;
  694 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
  695              (instregex "LD2i(8|16|32)_POST")>;
  696 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
  697              (instregex "LD2i64$")>;
  698 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
  699              (instregex "LD2i64_POST")>;
  700
  701 def : InstRW<[WriteVLDShuffle, WriteV], // LD2R (replicate), any arrangement.
  702              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
  703 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
  704              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 705
  706 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], // LD3, 64-bit arrangements.
  707              (instregex "LD3Threev(8b|4h|2s)$")>;
  708 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
  709              (instregex "LD3Threev(8b|4h|2s)_POST")>;
  710 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], // LD3, 128-bit.
  711              (instregex "LD3Threev(16b|8h|4s|2d)$")>;
  712 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
  713              (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
  714
  715 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], // LD3 to one lane.
  716              (instregex "LD3i(8|16|32)$")>;
  717 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
  718              (instregex "LD3i(8|16|32)_POST")>;
  719
  720 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], // 64-bit lane.
  721              (instregex "LD3i64$")>;
  722 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
  723              (instregex "LD3i64_POST")>;
  724
  725 def : InstRW<[WriteVLDShuffle, WriteV, WriteV], // LD3R, element size below 64 bits.
  726              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
  727 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
  728              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
  729
  730 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], // LD3R, 64-bit elements.
  731              (instrs LD3Rv1d,LD3Rv2d)>;
  732 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
  733              (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
 734
  735 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], // LD4, 64-bit.
  736              (instregex "LD4Fourv(8b|4h|2s)$")>;
  737 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
  738              (instregex "LD4Fourv(8b|4h|2s)_POST")>;
  739 def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, // LD4, 128-bit arrangements.
  740               WriteVLDPairShuffle, WriteVLDPairShuffle],
  741              (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
  742 def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
  743               WriteVLDPairShuffle, WriteVLDPairShuffle],
  744              (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
  745
  746 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], // LD4 to one lane.
  747              (instregex "LD4i(8|16|32)$")>;
  748 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
  749              (instregex "LD4i(8|16|32)_POST")>;
 750
 751
 752 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
 753              (instrs LD4i64)>;
 754 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
 755              (instrs LD4i64_POST)>;
 756
  757 def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], // LD4R, elements < 64 bits.
  758              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
  759 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
  760              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
  761
  762 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], // LD4R, 64-bit elements.
  763              (instrs LD4Rv1d,LD4Rv2d)>;
  764 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
  765              (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
 766
 767 //---
 768 // 7.9.16 Store, element operations
 769 //---
 770
  771 // Only the WriteAdr for writeback matches a def operand.
  772 // The WriteVSTs only consume resources.
 773
  774 def : InstRW<[WriteVST], // ST1, one register, any arrangement.
  775              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
  776 def : InstRW<[WriteAdr, WriteVST], // _POST forms list WriteAdr first (the only def).
  777              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
  778
  779 def : InstRW<[WriteVSTShuffle], // ST1, two registers, 64-bit arrangements.
  780              (instregex "ST1Twov(8b|4h|2s|1d)$")>;
  781 def : InstRW<[WriteAdr, WriteVSTShuffle],
  782              (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
  783 def : InstRW<[WriteVST, WriteVST], // ST1, two registers, 128-bit arrangements.
  784              (instregex "ST1Twov(16b|8h|4s|2d)$")>;
  785 def : InstRW<[WriteAdr, WriteVST, WriteVST],
  786              (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
  787
  788 def : InstRW<[WriteVSTShuffle, WriteVST], // ST1, three registers, 64-bit.
  789              (instregex "ST1Threev(8b|4h|2s|1d)$")>;
  790 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
  791              (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
  792 def : InstRW<[WriteVST, WriteVST, WriteVST], // ST1, three registers, 128-bit.
  793              (instregex "ST1Threev(16b|8h|4s|2d)$")>;
  794 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
  795              (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
  796
  797 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], // ST1, four registers, 64-bit.
  798              (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
  799 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  800              (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
  801 def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], // Four registers, 128-bit.
  802              (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
  803 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
  804              (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
  805
  806 def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>; // One lane.
  807 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
  808
  809 def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
  810 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
 811
  812 def : InstRW<[WriteVSTShuffle], // ST2, 64-bit arrangements.
  813              (instregex "ST2Twov(8b|4h|2s)$")>;
  814 def : InstRW<[WriteAdr, WriteVSTShuffle], // _POST adds WriteAdr (writeback).
  815              (instregex "ST2Twov(8b|4h|2s)_POST")>;
  816 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], // ST2, 128-bit arrangements.
  817              (instregex "ST2Twov(16b|8h|4s|2d)$")>;
  818 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  819              (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
  820
  821 def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>; // One lane.
  822 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
  823 def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
  824 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
 825
  826 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], // ST3, 64-bit arrangements.
  827              (instregex "ST3Threev(8b|4h|2s)$")>;
  828 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  829              (instregex "ST3Threev(8b|4h|2s)_POST")>;
  830 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], // ST3, 128-bit.
  831              (instregex "ST3Threev(16b|8h|4s|2d)$")>;
  832 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
  833              (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
  834
  835 def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>; // One lane.
  836 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
  837
  838 def :InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
  839 def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
 840
  841 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], // ST4, 64-bit arrangements.
  842             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
  843 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
  844             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
  845 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, // ST4, 128-bit arrangements.
  846               WriteVSTPairShuffle, WriteVSTPairShuffle],
  847              (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
  848 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
  849               WriteVSTPairShuffle, WriteVSTPairShuffle],
  850              (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
  851
  852 def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>; // One lane.
  853 def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
  854
  855 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
  856 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
 857
  858 // Atomic operations are not supported: the write is marked Unsupported.
  859 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
 860
 861 //---
 862 // Unused SchedRead types
 863 //---
 864
  865 def : ReadAdvance<ReadI, 0>; // Each of these integer reads gets a 0-cycle advance.
  866 def : ReadAdvance<ReadISReg, 0>;
  867 def : ReadAdvance<ReadIEReg, 0>;
  868 def : ReadAdvance<ReadIM, 0>;
  869 def : ReadAdvance<ReadIMA, 0>;
  870 def : ReadAdvance<ReadID, 0>;
 871
 872 } // SchedModel = CycloneModel