lib/Target/AArch64/AArch64SchedCyclone.td

   1 //=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the machine model for AArch64 Cyclone to support
  10 // instruction scheduling and other instruction cost heuristics.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 def CycloneModel : SchedMachineModel {
  15   let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
  16   let MicroOpBufferSize = 192; // Based on the reorder buffer.
  17   let LoadLatency = 4; // Optimistic load latency; same value as WriteLD.
  18   let MispredictPenalty = 16; // 14-19 cycles are typical.
  19   let CompleteModel = 1; // Every instruction is expected to have sched info.
  20
  21   list<Predicate> UnsupportedFeatures = SVEUnsupported.F; // Not described here.
  22 }
  23
  24 //===----------------------------------------------------------------------===//
  25 // Define each kind of processor resource and number available on Cyclone.
  26
  27 // 4 integer pipes: I[0..3]. Every other integer unit below is a subset.
  28 def CyUnitI : ProcResource<4> {
  29   let BufferSize = 48;
  30 }
  31
  32 // 2 branch units: I[0..1]. Using one also uses a CyUnitI (its Super).
  33 def CyUnitB : ProcResource<2> {
  34   let Super  = CyUnitI;
  35   let BufferSize = 24;
  36 }
  37
  38 // 1 indirect-branch unit: I[0]. A subset of the branch units (Super).
  39 def CyUnitBR : ProcResource<1> {
  40   let Super  = CyUnitB;
  41 }
  42
  43 // 2 shifter pipes: I[2..3]
  44 // When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
  45 def CyUnitIS : ProcResource<2> {
  46   let Super = CyUnitI;
  47   let BufferSize = 24;
  48 }
  49
  50 // 1 mul pipe: I[0]. Declared as a subset of the indirect-branch unit.
  51 def CyUnitIM : ProcResource<1> {
  52   let Super = CyUnitBR;
  53   let BufferSize = 32;
  54 }
  55
  56 // 1 div pipe: I[1]. Declared as a subset of the branch units.
  57 def CyUnitID : ProcResource<1> {
  58   let Super = CyUnitB;
  59   let BufferSize = 16;
  60 }
  61
  62 // 1 integer division unit (no Super). This is driven by the ID pipe, but only
  63 // consumes the pipe for one cycle at issue and another cycle at writeback.
  64 def CyUnitIntDiv : ProcResource<1>;
  65
  66 // 2 ld/st pipes.
  67 def CyUnitLS : ProcResource<2> {
  68   let BufferSize = 28;
  69 }
  70
  71 // 3 fp/vector pipes: V[0..2]. Every other vector unit below is a subset.
  72 def CyUnitV : ProcResource<3> {
  73   let BufferSize = 48;
  74 }
  75 // 2 fp/vector arithmetic and multiply pipes: V[0-1]
  76 def CyUnitVM : ProcResource<2> {
  77   let Super = CyUnitV;
  78   let BufferSize = 32;
  79 }
  80 // 1 fp/vector division/sqrt pipe: V[2]
  81 def CyUnitVD : ProcResource<1> {
  82   let Super = CyUnitV;
  83   let BufferSize = 16;
  84 }
  85 // 1 fp compare pipe: V[0]. Declared as a subset of the multiply pipes.
  86 def CyUnitVC : ProcResource<1> {
  87   let Super = CyUnitVM;
  88   let BufferSize = 16;
  89 }
  90
  91 // 2 fp division/square-root units (no Super). These are driven by the VD pipe,
  92 // but only consume the pipe for one cycle at issue and a cycle at writeback.
  93 def CyUnitFloatDiv : ProcResource<2>;
  94
  95 //===----------------------------------------------------------------------===//
  96 // Define scheduler read/write resources and latency on Cyclone.
  97 // This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
  98
  99 let SchedModel = CycloneModel in {
 100
 101 //---
 102 // 7.8.1. Moves
 103 //---
 104
 105 // A single nop micro-op (uX): no processor resources, zero latency.
 106 def WriteX : SchedWriteRes<[]> { let Latency = 0; }
 107
 108 // Move zero is a register rename (to machine register zero).
 109 // The move is replaced by a single nop micro-op.
 110 // MOVZ Rd, #0
 111 // AND Rd, Rzr, #imm
 112 def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
 113 def WriteImmZ  : SchedWriteVariant<[
 114                    SchedVar<WriteZPred, [WriteX]>,      // Zeroing form: nop.
 115                    SchedVar<NoSchedPred, [WriteImm]>]>; // Otherwise: ALU op.
 116 def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
 117
 118 // Move GPR is a register rename and single nop micro-op.
 119 // ORR Xd, XZR, Xm
 120 // ADD Xd, Xn, #0
 121 def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
 122 def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
 123 def WriteMov      : SchedWriteVariant<[
 124                       SchedVar<WriteIMovPred, [WriteX]>,  // GPR copy: nop.
 125                       SchedVar<WriteVMovPred, [WriteX]>,  // FPR copy: nop.
 126                       SchedVar<NoSchedPred,   [WriteI]>]>; // Otherwise: ALU op.
 127 def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; // NOTE(review): confirm these are the opcodes isGPRCopy tests.
 128
 129 // Move non-zero immediate is an integer ALU op.
 130 // MOVN,MOVZ,MOVK
 131 def : WriteRes<WriteImm, [CyUnitI]>;
 132
 133 //---
 134 // 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
 135 //              Shifts and Bitfield Operations
 136 //---
 137
 138 // ADR,ADRP
 139 // ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
 140 // ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
 141 // ADC(S),SBC(S)
 142 // Aliases: CMN, CMP, TST
 143 //
 144 // Conditional operations.
 145 // CCMNi,CCMPi,CCMNr,CCMPr,
 146 // CSEL,CSINC,CSINV,CSNEG
 147 //
 148 // Bit counting and reversal operations.
 149 // CLS,CLZ,RBIT,REV,REV16,REV32
 150 def : WriteRes<WriteI, [CyUnitI]>;
 151
 152 // ADD with shifted register operand is a single micro-op that
 153 // consumes a shift pipeline for two cycles.
 154 // ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
 155 // EXAMPLE: ADDrs Xn, Xm LSL #imm
 156 def : WriteRes<WriteISReg, [CyUnitIS]> {
 157   let Latency = 2;
 158   let ResourceCycles = [2];
 159 }
 160
 161 // ADD with extended register operand is the same as shifted reg operand.
 162 // ADD(S)re,SUB(S)re
 163 // EXAMPLE: ADDXre Xn, Xm, UXTB #1
 164 def : WriteRes<WriteIEReg, [CyUnitIS]> {
 165   let Latency = 2;
 166   let ResourceCycles = [2];
 167 }
 168
 169 // Variable shift and bitfield operations.
 170 // ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
 171 def : WriteRes<WriteIS, [CyUnitIS]>;
 172
 173 // EXTR Shifts a pair of registers and requires two micro-ops.
 174 // The second micro-op is delayed, as modeled by ReadExtrHi.
 175 // EXTR Xn, Xm, #imm
 176 def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
 177   let Latency = 2;
 178   let NumMicroOps = 2;
 179 }
 180
 181 // EXTR's first register read is delayed by one cycle, effectively
 182 // shortening its writer's latency.
 183 // EXTR Xn, Xm, #imm
 184 def : ReadAdvance<ReadExtrHi, 1>;
 185
 186 //---
 187 // 7.8.6. Multiplies
 188 //---
 189
 190 // MUL/MNEG are aliases for MADD/MSUB.
 191 // MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
 192 def : WriteRes<WriteIM32, [CyUnitIM]> {
 193   let Latency = 4;
 194 }
 195 // MADDX,MSUBX,SMULH,UMULH
 196 def : WriteRes<WriteIM64, [CyUnitIM]> {
 197   let Latency = 5;
 198 }
 199
 200 //---
 201 // 7.8.7. Divide
 202 //---
 203
 204 // 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
 205 // The ID pipe is consumed for 2 cycles: issue and writeback.
 206 // SDIVW,UDIVW
 207 def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
 208   let Latency = 10;
 209   let ResourceCycles = [2, 10];
 210 }
 211 // 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
 212 // The ID pipe is consumed for 2 cycles: issue and writeback.
 213 // SDIVX,UDIVX
 214 def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
 215   let Latency = 13;
 216   let ResourceCycles = [2, 13];
 217 }
 218
 219 //---
 220 // 7.8.8,7.8.10. Load/Store, single element
 221 //---
 222
 223 // Integer loads take 4 cycles and use one LS unit for one cycle.
 224 def : WriteRes<WriteLD, [CyUnitLS]> {
 225   let Latency = 4;
 226 }
 227
 228 // Store-load forwarding is 4 cycles.
 229 //
 230 // Note: The store-exclusive sequence incorporates this
 231 // latency. However, general heuristics should not model the
 232 // dependence between a store and subsequent may-alias load because
 233 // hardware speculation works.
 234 def : WriteRes<WriteST, [CyUnitLS]> {
 235   let Latency = 4;
 236 }
 237
 238 // Load from base address plus an optionally scaled register offset.
 239 // Rt latency is latency WriteIS + WriteLD.
 240 // EXAMPLE: LDR Xn, Xm [, lsl 3]
 241 def CyWriteLDIdx : SchedWriteVariant<[
 242   SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
 243   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
 244 def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
 245
 246 // EXAMPLE: STR Xn, Xm [, lsl 3]
 247 def CyWriteSTIdx : SchedWriteVariant<[
 248   SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
 249   SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
 250 def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
 251
 252 // Read the (unshifted) base register Xn in the second micro-op one cycle later.
 253 // EXAMPLE: LDR Xn, Xm [, lsl 3]
 254 def ReadBaseRS : SchedReadAdvance<1>;
 255 def CyReadAdrBase : SchedReadVariant<[
 256   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
 257   SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
 258 def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
 259
 260 //---
 261 // 7.8.9,7.8.11. Load/Store, paired
 262 //---
 263
 264 // Address pre/post increment is a simple ALU op with one cycle latency.
 265 def : WriteRes<WriteAdr, [CyUnitI]>;
 266
 267 // LDP high register write is fused with the load, but a nop micro-op remains.
 268 def : WriteRes<WriteLDHi, []> {
 269   let Latency = 4;
 270 }
 271
 272 // STP is a vector op and store, except for QQ, which is just two stores.
 273 def : SchedAlias<WriteSTP, WriteVSTShuffle>;
 274 def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
 275
 276 //---
 277 // 7.8.13. Branches
 278 //---
 279
 280 // Branches take a single micro-op.
 281 // The misprediction penalty is defined as a SchedMachineModel property.
 282 def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
 283 def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
 284
 285 //---
 286 // 7.8.14. Never-issued Instructions, Barrier and Hint Operations
 287 //---
 288
 289 // NOP,SEV,SEVL,WFE,WFI,YIELD
 290 def : WriteRes<WriteHint, []> {let Latency = 0;}
 291 // ISB
 292 def : InstRW<[WriteI], (instrs ISB)>;
 293 // CLREX,DMB,DSB
 294 def : WriteRes<WriteBarrier, [CyUnitLS]>;
 295
 296 // System instructions get an invalid latency because the latency of
 297 // other operations across them is meaningless.
 298 def : WriteRes<WriteSys, []> {let Latency = -1;}
 299
 300 //===----------------------------------------------------------------------===//
 301 // 7.9 Vector Unit Instructions
 302
 303 // Simple vector operations take 2 cycles.
 304 def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
 305
 306 // Define some longer latency vector op types for Cyclone.
 307 def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 308 def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
 309 def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
 310 def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
 311
 312 // Simple floating-point operations take 2 cycles.
 313 def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
 314
 315 //---
 316 // 7.9.1 Vector Moves
 317 //---
 318
 319 // TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
 320 // generates expensive int-float conversion instead:
 321 // FMOVDi Dd, #0.0
 322 // FMOVv2f64ns Vd.2d, #0.0
 323
 324 // FMOVSi,FMOVDi
 325 def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
 326
 327 // MOVI,MVNI are WriteV
 328 // FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
 329
 330 // Move FPR is a register rename and single nop micro-op.
 331 // ORR.16b Vd,Vn,Vn
 332 // COPY is handled above in the WriteMov Variant.
 333 def WriteVMov    : SchedWriteVariant<[
 334                      SchedVar<WriteVMovPred, [WriteX]>,
 335                      SchedVar<NoSchedPred,   [WriteV]>]>;
 336 def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
 337
 338 // FMOVSr,FMOVDr are WriteF.
 339
 340 // MOV V,V is a WriteV.
 341
 342 // CPY D,V[x] is a WriteV
 343
 344 // INS V[x],V[y] is a WriteV.
 345
 346 // FMOVWSr,FMOVXDr,FMOVXDHighr
 347 def : WriteRes<WriteFCopy, [CyUnitLS]> {
 348   let Latency = 5;
 349 }
 350
 351 // FMOVSWr,FMOVDXr
 352 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
 353
 354 // INS V[x],R
 355 def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
 356 def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
 357
 358 // SMOV,UMOV R,V[x]
 359 def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
 360 def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
 361
 362 // DUP V,R
 363 def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
 364
 365 // DUP V,V[x] is a WriteV.
 366
 367 //---
 368 // 7.9.2 Integer Arithmetic, Logical, and Comparisons
 369 //---
 370
 371 // BIC,ORR V,#imm are WriteV
 372
 373 def : InstRW<[CyWriteV3], (instregex "ABSv")>;
 374
 375 // MVN,NEG,NOT are WriteV
 376
 377 def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
 378
 379 // ADDP is a WriteV.
 380 def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
 381 def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
 382
 383 def : InstRW<[CyWriteV3],
 384              (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
 385
 386 def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
 387
 388 // ADD,SUB are WriteV
 389
 390 // Forward declare.
 391 def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 392
 393 // Add/Diff and accumulate uses the vector multiply unit.
 394 def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
 395 def CyReadVAccum  : SchedReadAdvance<1,
 396                     [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
 397
 398 def : InstRW<[CyWriteVAccum, CyReadVAccum],
 399              (instregex "SADALP","UADALP")>;
 400
 401 def : InstRW<[CyWriteVAccum, CyReadVAccum],
 402              (instregex "SABAv","UABAv","SABALv","UABALv")>;
 403
 404 def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
 405
 406 def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
 407
 408 def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
 409
 410 // WriteV includes:
 411 // AND,BIC,CMTST,EOR,ORN,ORR
 412 // ADDP
 413 // SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
 414 // SADDL,SSUBL,UADDL,USUBL
 415 // SADDW,SSUBW,UADDW,USUBW
 416
 417 def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
 418                                      "CMLEv","CMLTv",
 419                                      "CMHIv","CMHSv")>;
 420
 421 def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
 422                                      "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
 423
 424 def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
 425                                        "SABDLv","UABDLv")>;
 426
 427 //---
 428 // 7.9.3 Floating Point Arithmetic and Comparisons
 429 //---
 430
 431 // FABS,FNEG are WriteF
 432
 433 def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
 434 def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
 435
 436 def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
 437                                      "FMINPv2i","FMINNMPv2i")>;
 438
 439 def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
 440
 441 def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
 442                                   FSUBSrr,FSUBv2f32,FSUBv4f32,
 443                                   FADDPv2f32,FADDPv4f32,
 444                                   FABD32,FABDv2f32,FABDv4f32)>;
 445 def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
 446                                   FSUBDrr,FSUBv2f64,
 447                                   FADDPv2f64,
 448                                   FABD64,FABDv2f64)>;
 449
 450 def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
 451
 452 def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
 453                                      "FMAXS","FMAXD","FMAXv",
 454                                      "FMINS","FMIND","FMINv",
 455                                      "FMAXNMS","FMAXNMD","FMAXNMv",
 456                                      "FMINNMS","FMINNMD","FMINNMv",
 457                                      "FMAXPv2f","FMAXPv4f",
 458                                      "FMINPv2f","FMINPv4f",
 459                                      "FMAXNMPv2f","FMAXNMPv4f",
 460                                      "FMINNMPv2f","FMINNMPv4f")>;
 461
 462 // FCMP,FCMPE,FCCMP,FCCMPE
 463 def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
 464
 465 // FCSEL is a WriteF.
 466
 467 //---
 468 // 7.9.4 Shifts and Bitfield Operations
 469 //---
 470
 471 // SHL is a WriteV
 472
 473 def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
 474 def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
 475
 476 def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 477 def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
 478
 479 // Shift and accumulate uses the vector multiply unit.
 480 def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
 481 def CyReadVShiftAcc  : SchedReadAdvance<1,
 482                         [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
 483 def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
 484              (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
 485
 486 // SSHL,USHL are WriteV.
 487
 488 def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
 489
 490 // SQSHL,SQSHLU,UQSHL are WriteV.
 491
 492 def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
 493
 494 // WriteV includes:
 495 // SHLL,SSHLL,USHLL
 496 // SLI,SRI
 497 // BIF,BIT,BSL
 498 // EXT
 499 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
 500 // XTN2
 501
 502 def : InstRW<[CyWriteV4],
 503              (instregex "RSHRNv","SHRNv",
 504                         "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
 505                         "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
 506
 507 //---
 508 // 7.9.5 Multiplication
 509 //---
 510
 511 def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
 512 def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
 513                              "SQDMULLv","SQDMULHv","SQRDMULHv")>;
 514
 515 // FMUL,FMULX,FNMUL default to WriteFMul.
 516 def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
 517
 518 def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
 519 def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
 520                                FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
 521
 522 def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
 523 def : InstRW<[CyWriteVMul, CyReadVMulAcc],
 524              (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
 525               "SQDMLAL","SQDMLSL")>;
 526
 527 def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
 528 def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
 529 def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
 530 def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
 531
 532 def : InstRW<[CyWriteSMul, CyReadSMul], // Single-precision fused multiply-add.
 533              (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
 534               FMLAv2f32,FMLAv4f32,
 535               FMLAv1i32_indexed,FMLAv2i32_indexed)>;
 536 def : InstRW<[CyWriteDMul, CyReadDMul], // Double-precision fused multiply-add.
 537              (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
 538               FMLAv2f64,FMLAv1i64_indexed,FMLAv2i64_indexed, // v1i64: scalar D.
 539               FMLSv2f64,FMLSv2i64_indexed)>;
 540
 541 def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
 542 def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
 543
 544 //---
 545 // 7.9.6 Divide and Square Root
 546 //---
 547
 548 // FDIV,FSQRT
 549 // TODO: Add 64-bit variant with 19 cycle latency.
 550 // TODO: Specialize FSQRT for longer latency.
 551 def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
 552   let Latency = 17;
 553   let ResourceCycles = [2, 17];
 554 }
 555
 556 def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
 557
 558 def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
 559 def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
 560
 561 def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
 562 def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
 563 def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
 564 def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
 565
 566 //---
 567 // 7.9.7 Integer-FP Conversions
 568 //---
 569
 570 // FCVT lengthen f16/s32
 571 def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
 572
 573 // FCVT,FCVTN,FCVTXN
 574 // SCVTF,UCVTF V,V
 575 // FRINT(AIMNPXZ) V,V
 576 def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
 577
 578 // SCVTF/UCVTF S/D, Rn (GPR -> FPR) = VLD5+V4: 9 cycles.
 579 def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
 580 def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
 581
 582 // FCVT[AMNPZ][SU] Rd, S/D (FPR -> GPR) = V6+LD4: 10 cycles
 583 def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
 584 def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
 585
 586 // FCVTL is a WriteV
 587
 588 //---
 589 // 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
 590 //---
 591
 592 def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
 593 def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
 594                                        AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
 595                                        SHA1SU0rrr)>;
 596
 597 def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
 598 def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
 599
 600 def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
 601 def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
 602                                        SHA256Hrrr,SHA256H2rrr)>;
 603
 604 // TRN,UZP,ZIP are WriteV.
 605
 606 // TBL,TBX are WriteV.
 607
 608 //---
 609 // 7.9.11-7.9.14 Load/Store, single element and paired
 610 //---
 611
 612 // Loading into the vector unit takes 5 cycles vs 4 for integer loads.
 613 def : WriteRes<WriteVLD, [CyUnitLS]> {
 614   let Latency = 5;
 615 }
 616
 617 // Store-load forwarding is 4 cycles.
 618 def : WriteRes<WriteVST, [CyUnitLS]> {
 619   let Latency = 4;
 620 }
 621
 622 // WriteVLDPair/VSTPair sequences are expanded by the target description.
 623
 624 //---
 625 // 7.9.15 Load, element operations
 626 //---
 627
 628 // Only the first WriteVLD and the WriteAdr for writeback match def operands.
 629 // Subsequent WriteVLDs only consume resources. Since all loaded values have
 630 // the same latency, this is acceptable.
 631
 632 // Vd is read 5 cycles after issuing the vector load.
 633 def : ReadAdvance<ReadVLD, 5>;
 634
 635 def : InstRW<[WriteVLD],
 636              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 637 def : InstRW<[WriteVLD, WriteAdr],
 638              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 639
 640 // Register writes from the load's high half are fused micro-ops.
 641 def : InstRW<[WriteVLD],
 642              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
 643 def : InstRW<[WriteVLD, WriteAdr],
 644              (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
 645 def : InstRW<[WriteVLD, WriteVLD],
 646              (instregex "LD1Twov(16b|8h|4s|2d)$")>;
 647 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 648              (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
 649
 650 def : InstRW<[WriteVLD, WriteVLD],
 651              (instregex "LD1Threev(8b|4h|2s|1d)$")>;
 652 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 653              (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
 654 def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
 655              (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 656 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
 657              (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
 658
 659 def : InstRW<[WriteVLD, WriteVLD],
 660              (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 661 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 662              (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
 663 def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
 664              (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 665 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
 666              (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
 667
 668 def : InstRW<[WriteVLDShuffle, ReadVLD],
 669              (instregex "LD1i(8|16|32)$")>;
 670 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
 671              (instregex "LD1i(8|16|32)_POST")>;
 672
 673 def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
 674 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
 675
 676 def : InstRW<[WriteVLDShuffle],
 677              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 678 def : InstRW<[WriteVLDShuffle, WriteAdr],
 679              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 680
 681 def : InstRW<[WriteVLDShuffle, WriteV],
 682              (instregex "LD2Twov(8b|4h|2s)$")>;
 683 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
 684              (instregex "LD2Twov(8b|4h|2s)_POST$")>;
 685 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
 686              (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 687 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
 688              (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
 689
 690 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
 691              (instregex "LD2i(8|16|32)$")>;
 692 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
 693              (instregex "LD2i(8|16|32)_POST")>;
 694 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
 695              (instregex "LD2i64$")>;
 696 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
 697              (instregex "LD2i64_POST")>;
 698
 699 def : InstRW<[WriteVLDShuffle, WriteV],
 700              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 701 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
 702              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 703
 704 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
 705              (instregex "LD3Threev(8b|4h|2s)$")>;
 706 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
 707              (instregex "LD3Threev(8b|4h|2s)_POST")>;
 708 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
 709              (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 710 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
 711              (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
 712
 713 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
 714              (instregex "LD3i(8|16|32)$")>;
 715 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
 716              (instregex "LD3i(8|16|32)_POST")>;
 717
 718 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
 719              (instregex "LD3i64$")>;
 720 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
 721              (instregex "LD3i64_POST")>;
 722
 723 def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
 724              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
 725 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
 726              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
 727
 728 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
 729              (instrs LD3Rv1d,LD3Rv2d)>;
 730 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
 731              (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
 732
 733 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
 734              (instregex "LD4Fourv(8b|4h|2s)$")>;
 735 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
 736              (instregex "LD4Fourv(8b|4h|2s)_POST")>;
 737 def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
 738               WriteVLDPairShuffle, WriteVLDPairShuffle],
 739              (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 740 def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
 741               WriteVLDPairShuffle, WriteVLDPairShuffle],
 742              (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
 743
 744 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
 745              (instregex "LD4i(8|16|32)$")>;
 746 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
 747              (instregex "LD4i(8|16|32)_POST")>;
 748
 749 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
 750              (instrs LD4i64)>;
 751 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle,
 752               WriteV, WriteV], // Same writes as LD4i64, plus WriteAdr.
 753              (instrs LD4i64_POST)>;
 754
 755 def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
 756              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
 757 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
 758              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
 759
 760 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
 761              (instrs LD4Rv1d,LD4Rv2d)>;
 762 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
 763              (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
 764
 765 //---
 766 // 7.9.16 Store, element operations
 767 //---
 768
 769 // Only the WriteAdr for writeback matches a def operand.
 770 // Subsequent WriteVSTs only consume resources.
 771
 772 def : InstRW<[WriteVST],
 773              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 774 def : InstRW<[WriteAdr, WriteVST],
 775              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 776
 777 def : InstRW<[WriteVSTShuffle],
 778              (instregex "ST1Twov(8b|4h|2s|1d)$")>;
 779 def : InstRW<[WriteAdr, WriteVSTShuffle],
 780              (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
 781 def : InstRW<[WriteVST, WriteVST],
 782              (instregex "ST1Twov(16b|8h|4s|2d)$")>;
 783 def : InstRW<[WriteAdr, WriteVST, WriteVST],
 784              (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
 785
 786 def : InstRW<[WriteVSTShuffle, WriteVST],
 787              (instregex "ST1Threev(8b|4h|2s|1d)$")>;
 788 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
 789              (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
 790 def : InstRW<[WriteVST, WriteVST, WriteVST],
 791              (instregex "ST1Threev(16b|8h|4s|2d)$")>;
 792 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
 793              (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
 794
 795 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 796              (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
 797 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 798              (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
 799 def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
 800              (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
 801 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
 802              (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
 803
 804 def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
 805 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
 806
 807 def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
 808 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
 809
 810 def : InstRW<[WriteVSTShuffle],
 811              (instregex "ST2Twov(8b|4h|2s)$")>;
 812 def : InstRW<[WriteAdr, WriteVSTShuffle],
 813              (instregex "ST2Twov(8b|4h|2s)_POST")>;
 814 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 815              (instregex "ST2Twov(16b|8h|4s|2d)$")>;
 816 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 817              (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
 818
 819 def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
 820 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
 821 def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
 822 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
 823
 824 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 825              (instregex "ST3Threev(8b|4h|2s)$")>;
 826 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 827              (instregex "ST3Threev(8b|4h|2s)_POST")>;
 828 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
 829              (instregex "ST3Threev(16b|8h|4s|2d)$")>;
 830 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
 831              (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
 832
 833 def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
 834 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
 835
 836 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST3i64)>;
 837 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST3i64_POST)>;
 838
 839 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
 840             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
 841 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
 842             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
 843 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
 844               WriteVSTPairShuffle, WriteVSTPairShuffle],
 845              (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
 846 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
 847               WriteVSTPairShuffle, WriteVSTPairShuffle],
 848              (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
 849
 850 def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
 851 def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
 852
 853 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
 854 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
 855
 856 // Atomic operations are not supported.
 857 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
 858
 859 //---
 860 // Unused SchedRead types
 861 //---
 862
 863 def : ReadAdvance<ReadI, 0>;
 864 def : ReadAdvance<ReadISReg, 0>;
 865 def : ReadAdvance<ReadIEReg, 0>;
 866 def : ReadAdvance<ReadIM, 0>;
 867 def : ReadAdvance<ReadIMA, 0>;
 868 def : ReadAdvance<ReadID, 0>;
 869
 870 } // SchedModel = CycloneModel