lib/Target/X86/X86ScheduleBtVer2.td

   1 //=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the machine model for AMD btver2 (Jaguar) to support
  10 // instruction scheduling and other instruction cost heuristics. Based on the AMD
  11 // Software Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 def BtVer2Model : SchedMachineModel {
       // Machine model for btver2. The definitions that follow are attached to it
       // through the enclosing `let SchedModel = BtVer2Model in { ... }`.
  16   // All x86 instructions are modeled as a single micro-op, and btver2 can
  17   // decode 2 instructions per cycle.
  18   let IssueWidth = 2;
  19   let MicroOpBufferSize = 64; // Retire Control Unit (JRCU tracks 64 macro-ops)
  20   let LoadLatency = 5; // FPU load latency (worst case; integer loads are 3 cycles)
  21   let HighLatency = 25;
  22   let MispredictPenalty = 14; // Minimum branch misdirection penalty
  23   let PostRAScheduler = 1;
  24
  25   // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
  26   // the scheduler to assign a default model to unrecognized opcodes.
  27   let CompleteModel = 0;
  28 }
  29
  30 let SchedModel = BtVer2Model in {
  31
  32 // Jaguar can issue up to 6 micro-ops in one cycle (six pipes are defined here).
  33 def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
  34 def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
  35 def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU (the resource used by WriteLoad)
  36 def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (used by WriteStore; also handles 3-operand LEA)
  37 def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
  38 def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
  39
  40 // The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
  41 // speculative versions of the 64-bit integer registers.
  42 // Reference: www.realworldtech.com/jaguar/4/
  43 //
  44 // The processor always keeps the different parts of an integer register
  45 // together. An instruction that writes to a part of a register will therefore
  46 // have a false dependence on any previous write to the same register or any
  47 // part of it.
  48 // Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
  49 // access" - Agner Fog's "microarchitecture.pdf".
     //
     // NOTE(review): the two bracketed integer lists presumably give, per register
     // class (GR64, CCR), the rename cost ([1, 1]) and whether moves may be
     // eliminated ([1, 0]) -- confirm against the RegisterFile class definition.
  50 def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
  51                                0,  // Max moves that can be eliminated per cycle.
  52                                1>; // Restrict move elimination to zero regs.
  53
  54 // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
  55 // registers. Operations on 256-bit data types are cracked into two COPs.
  56 // Reference: www.realworldtech.com/jaguar/4/
  57
  58 // The PRF in the floating point unit can eliminate a move from an MMX or SSE
  59 // register that is known to be zero (i.e. it has been zeroed using a zero-idiom
  60 // dependency breaking instruction, or via VZEROALL).
  61 // Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
  62 // instructions" - Agner Fog's "microarchitecture.pdf"
     //
     // NOTE(review): [1, 1, 2] is presumably the per-class rename cost, so that a
     // VR256 takes two of the 72 entries, and [1, 1, 0] the per-class
     // move-elimination flags -- confirm against the RegisterFile class definition.
  63 def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
  64                           0,  // Max moves that can be eliminated per cycle.
  65                           1>; // Restrict move elimination to zero regs.
  66
  67 // The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
  68 // retire up to two macro-ops per cycle.
  69 // Reference: "Software Optimization Guide for AMD Family 16h Processors"
  70 def JRCU : RetireControlUnit<64, 2>;
  71
  72 // Integer Pipe Scheduler: groups the two integer ALU pipes.
  73 def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  74   let BufferSize=20; // NOTE(review): presumably the scheduler's entry count -- confirm
  75 }
  76
  77 // AGU Pipe Scheduler: groups the LAGU and SAGU pipes.
  78 def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  79   let BufferSize=12;
  80 }
  81
  82 // FPU Pipe Scheduler: groups the two vector/FPU pipes.
  83 def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  84   let BufferSize=18;
  85 }
  86
  87 // Functional units. Writes in this file list a unit next to the pipe it is
     // paired with, e.g. [JALU1, JMul] or [JFPU0, JFPA].
  88 def JDiv    : ProcResource<1>; // integer division
  89 def JMul    : ProcResource<1>; // integer multiplication
  90 def JVALU0  : ProcResource<1>; // vector integer
  91 def JVALU1  : ProcResource<1>; // vector integer
  92 def JVIMUL  : ProcResource<1>; // vector integer multiplication
  93 def JSTC    : ProcResource<1>; // vector store/convert
  94 def JFPM    : ProcResource<1>; // FP multiplication (also listed by the FP div/sqrt/rcp writes)
  95 def JFPA    : ProcResource<1>; // FP addition
  96
  97 // Functional unit groups
  98 def JFPX  : ProcResGroup<[JFPA, JFPM]>;     // either FP unit (FPA or FPM)
  99 def JVALU : ProcResGroup<[JVALU0, JVALU1]>; // either vector integer unit
 100
 101 // Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
 102 // cycles after the memory operand.
 103 def : ReadAdvance<ReadAfterLd, 3>;
 104
 105 // Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
 106 // cycles after the memory operand.
 107 def : ReadAdvance<ReadAfterVecLd, 5>;
 108 def : ReadAdvance<ReadAfterVecXLd, 5>;
 109 def : ReadAdvance<ReadAfterVecYLd, 5>;
 110
 111 /// "Additional 6 cycle transfer operation which moves a floating point
 112 /// operation input value from the integer unit to the floating point unit."
 113 /// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
     /// NOTE(review): the advance is negative; presumably this adds the 6 transfer
     /// cycles to the latency seen by the read instead of hiding any -- confirm.
 114 def : ReadAdvance<ReadInt2Fpu, -6>;
 115
 116 // Many SchedWrites are defined in pairs with and without a folded load.
 117 // Instructions with folded loads are usually micro-fused, so they only appear
 118 // as two micro-ops when dispatched by the schedulers.
 119 // This multiclass defines the resource usage for variants with and without
 120 // folded loads.
     //
     // Parameters:
     //   SchedRW  - the foldable write; SchedRW.Folded is its folded-load form.
     //   ExePorts - resources used by the register form.
     //   Lat      - latency of the register form.
     //   Res      - resource cycles, one entry per ExePorts entry (default: empty).
     //   UOps     - micro-ops of the register form (default: 1).
     //   LoadUOps - micro-ops added on top of UOps for the folded form (default: 0).
 121 multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
 122                             list<ProcResourceKind> ExePorts,
 123                             int Lat, list<int> Res = [], int UOps = 1,
 124                             int LoadUOps = 0> {
 125   // Register variant: uses ExePorts, Lat, Res and UOps exactly as passed in.
 126   def : WriteRes<SchedRW, ExePorts> {
 127     let Latency = Lat;
 128     let ResourceCycles = Res;
 129     let NumMicroOps = UOps;
 130   }
 131
 132   // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
 133   // latency. An empty Res is passed through empty instead of becoming [1].
 134   def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
 135     let Latency = !add(Lat, 3);
 136     let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
 137     let NumMicroOps = !add(UOps, LoadUOps);
 138   }
 139 }
 140
 141 multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
 142                             list<ProcResourceKind> ExePorts,
 143                             int Lat, list<int> Res = [], int UOps = 1,
 144                             int LoadUOps = 0> {
       // Defines a register-form WriteRes and a folded-load WriteRes for SchedRW.
       // Same shape as JWriteResIntPair, except that the folded load adds 5 cycles
       // of latency instead of 3.
 145   // Register variant: uses ExePorts, Lat, Res and UOps exactly as passed in.
 146   def : WriteRes<SchedRW, ExePorts> {
 147     let Latency = Lat;
 148     let ResourceCycles = Res;
 149     let NumMicroOps = UOps;
 150   }
 151
 152   // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
 153   // latency. An empty Res is passed through empty instead of becoming [1].
 154   def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
 155     let Latency = !add(Lat, 5);
 156     let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
 157     let NumMicroOps = !add(UOps, LoadUOps);
 158   }
 159 }
 160
 161 multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
 162                             list<ProcResourceKind> ExePorts,
 163                             int Lat, list<int> Res = [2], int UOps = 2,
 164                             int LoadUOps = 0> {
       // Defines a register-form WriteRes and a folded-load WriteRes for SchedRW.
       // Unlike the Int/Fpu pair multiclasses, the defaults here are two resource
       // cycles (Res = [2]) and two micro-ops (UOps = 2).
 165   // Register variant: uses ExePorts, Lat, Res and UOps exactly as passed in.
 166   def : WriteRes<SchedRW, ExePorts> {
 167     let Latency = Lat;
 168     let ResourceCycles = Res;
 169     let NumMicroOps = UOps;
 170   }
 171
 172   // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
 173   // latency. The [2] is prepended unconditionally (no empty-Res special case).
 174   def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
 175     let Latency = !add(Lat, 5);
 176     let ResourceCycles = !listconcat([2], Res);
 177     let NumMicroOps = !add(UOps, LoadUOps);
 178   }
 179 }
 180
 181 // Instructions that have local forwarding disabled have an extra +1cy latency.
 182
 183 // A folded store needs a cycle on the SAGU for the store data; most RMW
 184 // instructions don't need an extra uop.  ALU RMW operations don't seem to
 185 // benefit from STLF, and their observed latency is 6cy. That is the reason why
 186 // this write adds two extra cycles (instead of just 1cy for the store).
     // NOTE(review): X86WriteRes is defined outside this file; its operands
     // presumably follow the order used by the J* pair multiclasses (write,
     // resources, latency, resource cycles, micro-ops) -- confirm.
 187 defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
 188
 189 ////////////////////////////////////////////////////////////////////////////////
 190 // Arithmetic.
 191 ////////////////////////////////////////////////////////////////////////////////
     //
     // JWriteResIntPair rows read: write, resources, latency[, resource cycles
     // (one per resource)[, micro-ops]]. Each row also defines the folded-load
     // form (+JLAGU, +3cy latency).
     // NOTE(review): X86WriteRes rows appear to use the same operand order, but
     // that multiclass is defined outside this file -- confirm.
 192
 193 defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
 194 defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;
 195
 196 defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
 197 defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
 198 defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
 199 defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
 200 defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [2], 2>;
 201
     // Integer multiplies: all issue on JALU1 and occupy the JMul unit.
 202 defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
 203 defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
 204 defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
 205 defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
 206 defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
 207 defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
 208 defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
 209 defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
 210 defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
 211 defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
 212 defm : X86WriteRes<WriteIMulH,          [JALU1], 6, [4], 1>;
 213
     // Integer divides: JDiv is held for as many cycles as the latency.
 214 defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
 215 defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
 216 defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
 217 defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
 218 defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
 219 defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
 220 defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
 221 defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
 222
 223 defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;
 224
 225 defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
 226 defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
 227 def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
 228 def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>; // Setcc to memory: also uses the SAGU.
 229 def  : WriteRes<WriteLAHFSAHF, [JALU01]>;
 230
 231 defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>; // Bit tests; the *Ld rows add JLAGU and use latency 4.
 232 defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
 233 defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
 234 defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
 235 defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
 236 defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
 237
 238 // This is for simple LEAs with one or two input operands.
 239 def : WriteRes<WriteLEA, [JALU01]>;
 240
 241 // Bit counts. BSF/BSR hold the ALU group for 8 cycles; POPCNT/LZCNT are 1cy.
 242 defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
 243 defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
 244 defm : JWriteResIntPair<WritePOPCNT,         [JALU01], 1>;
 245 defm : JWriteResIntPair<WriteLZCNT,          [JALU01], 1>;
 246 defm : JWriteResIntPair<WriteTZCNT,          [JALU01], 2, [2], 2>;
 247
 248 // BMI1 BEXTR/BLS, BMI2 BZHI (WriteBZHI is routed to X86WriteResPairUnsupported).
 249 defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
 250 defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
 251 defm : X86WriteResPairUnsupported<WriteBZHI>;
 252
 253 ////////////////////////////////////////////////////////////////////////////////
 254 // Integer shifts and rotates.
 255 ////////////////////////////////////////////////////////////////////////////////
 256
     // Plain shifts and rotates (immediate and CL forms alike) are 1cy on either ALU.
 257 defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
 258 defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
 259 defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
 260 defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
 261
 262 // SHLD/SHRD. The rri/rrcl writes use only the ALU group; the mri/mrcl writes
     // add JLAGU and hold the ALU group for 22 cycles.
 263 defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
 264 defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
 265 defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
 266 defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
 266 defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
 267
 268 ////////////////////////////////////////////////////////////////////////////////
 269 // Loads, stores, and moves, not folded with other operations.
 270 ////////////////////////////////////////////////////////////////////////////////
 271
 272 def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; } // 3cy integer load, matching ReadAfterLd
 273 def : WriteRes<WriteStore,   [JSAGU]>;
 274 def : WriteRes<WriteStoreNT, [JSAGU]>;
 275 def : WriteRes<WriteMove,    [JALU01]>;
 276
 277 // Load/store MXCSR: modeled like a plain load (LAGU, 3cy) and a plain store (SAGU).
 278 def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
 279 def : WriteRes<WriteSTMXCSR, [JSAGU]>;
 280
 281 // Treat misc copies as a move.
 282 def : InstRW<[WriteMove], (instrs COPY)>;
 283
 284 ////////////////////////////////////////////////////////////////////////////////
 285 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 286 // These can often bypass execution ports completely.
 287 ////////////////////////////////////////////////////////////////////////////////
 288
 289 def : WriteRes<WriteZero,  []>; // empty resource list: no execution resource is consumed
 290
 291 ////////////////////////////////////////////////////////////////////////////////
 292 // Branches don't produce values, so they have no latency, but they still
 293 // consume resources. Indirect branches can fold loads.
 294 ////////////////////////////////////////////////////////////////////////////////
 295
     // NOTE(review): despite the banner comment, a latency of 1 is passed here
     // (JWriteResIntPair requires Lat); the folded-load form therefore gets 4.
 296 defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
 297
 298 ////////////////////////////////////////////////////////////////////////////////
 299 // Special case scheduling classes.
 300 ////////////////////////////////////////////////////////////////////////////////
 301
     // System and microcoded instructions are modeled with a flat 100-cycle latency.
 302 def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
 303 def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
 304 def : WriteRes<WriteFence,  [JSAGU]>;
 305
 306 // Nops don't have dependencies, so there's no actual latency, but we set this
 307 // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
 308 def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
 309
 310 def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> { // chosen by IsRegRegCompareAndSwap_8
 311   let Latency = 3;
 312   let ResourceCycles = [3];
 313   let NumMicroOps = 3;
 314 }
 315
 316 def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen by IsAtomicCompareAndSwap_8
 317   let Latency = 16;
 318   let ResourceCycles = [3,16,16]; // LAGU and SAGU are held for the whole latency
 319   let NumMicroOps = 5;
 320 }
 321
 322 def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen by IsAtomicCompareAndSwap
 323   let Latency = 17;
 324   let ResourceCycles = [3,17,17]; // LAGU and SAGU are held for the whole latency
 325   let NumMicroOps = 6;
 326 }
 327
 328 def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen by IsRegMemCompareAndSwap_8
 329   let Latency = 11;
 330   let ResourceCycles = [3,1,1];
 331   let NumMicroOps = 5;
 332 }
 333
 334 def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen by IsCompareAndSwap8B
 335   let Latency = 11;
 336   let ResourceCycles = [3,1,1];
 337   let NumMicroOps = 18;
 338 }
 339
 340 def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen by IsCompareAndSwap16B
 341   let Latency = 32;
 342   let ResourceCycles = [6,1,1];
 343   let NumMicroOps = 28;
 344 }
 345
 346 def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen by IsAtomicCompareAndSwap8B
 347   let Latency = 19;
 348   let ResourceCycles = [3,19,19]; // LAGU and SAGU are held for the whole latency
 349   let NumMicroOps = 18;
 350 }
 351
 352 def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen by IsAtomicCompareAndSwap16B
 353   let Latency = 38;
 354   let ResourceCycles = [6,38,38]; // LAGU and SAGU are held for the whole latency
 355   let NumMicroOps = 28;
 356 }
 357
 358 def JWriteCMPXCHGVariant :  SchedWriteVariant<[
       // Maps every CMPXCHG flavour to a write through the predicates listed here;
       // the NoSchedPred entry is the unconditional last alternative.
       // NOTE(review): entries are presumably tried in listed order, which is why
       // the atomic (LOCK) predicates come first -- confirm.
 359   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
 360   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
 361   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
 362   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
 363   SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
 364   SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
 365   SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
 366   SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
 367   SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
 368   SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
 369 ]>;
 370
 371 // The first five reads are contributed by the memory load operand.
 372 // We ignore those reads (five ReadDefault entries) and set a read-advance for
 373 // the other input operands including the implicit read of RAX.
 374 def : InstRW<[JWriteCMPXCHGVariant,
 375               ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
 376               ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
 377                                                  LCMPXCHG32, LCMPXCHG64,
 378                                                  CMPXCHG8rm, CMPXCHG16rm,
 379                                                  CMPXCHG32rm, CMPXCHG64rm)>;
 380
     // The rr opcodes get the same variant write with no per-operand read entries.
 381 def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
 382                                              CMPXCHG32rr, CMPXCHG64rr)>;
 383
     // CMPXCHG8B/16B (plain and LOCK-prefixed): four ReadAfterLd entries follow
     // the five memory-operand reads.
 384 def : InstRW<[JWriteCMPXCHGVariant,
 385               // Ignore reads contributed by the memory operand.
 386               ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
 387               // Add a read-advance to every implicit register read.
 388               ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
 389                                                                            CMPXCHG8B, CMPXCHG16B)>;
 390
 391 def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // used when CheckLockPrefix holds
 392   let Latency = 19;
 393   let ResourceCycles = [1,19,19]; // LAGU and SAGU are held for the whole latency
 394   let NumMicroOps = 1;
 395 }
 396
     // Picks the LOCK write above when the instruction carries a lock prefix and
     // the WriteALURMW write otherwise.
 397 def JWriteLOCK_ALURMWVariant :  SchedWriteVariant<[
 398   SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
 399   SchedVar<NoSchedPred,                       [WriteALURMW]>
 400 ]>;
     // Applied to the memory-destination INC/DEC/NOT/NEG opcodes of every width.
 401 def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
 402                                                  DEC8m, DEC16m, DEC32m, DEC64m,
 403                                                  NOT8m, NOT16m, NOT32m, NOT64m,
 404                                                  NEG8m, NEG16m, NEG32m, NEG64m)>;
 405
 406 def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> { // shared by XCHG8rr and the XADD*rr opcodes
 407   let Latency = 2;
 408   let ResourceCycles = [3]; // three ALU cycles for the three micro-ops
 409   let NumMicroOps = 3;
 410 }
 411 def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
 412                                                       XADD32rr, XADD64rr)>;
 413
 414 // This write defines the latency of the in/out register operand of a non-atomic
 415 // XADDrm. This is the first of a pair of writes that model non-atomic
 416 // XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
 417 //
 418 // We need two writes because the instruction latency differs from the output
 419 // register operand latency. In particular, the first write describes the first
 420 // (and only) output register operand of the instruction.  However, the
 421 // instruction latency is set to the MAX of all the write latencies. That's why
 422 // a second write is needed in this case (see example below).
 423 //
 424 // Example:
 425 //     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
 426 //                            ## ECX write Latency: 3cy
 427 //
 428 // Register ECX becomes available in 3 cycles. That is because the value of ECX
 429 // is exchanged with the value loaded from memory at (%rsp), and the load-to-use
 430 // latency is assumed to be 3cy.
 431 def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
 432   let Latency = 3;  // load-to-use latency
 433   let ResourceCycles = [3];
 434   let NumMicroOps = 3;
 435 }
 436
 437 // This write defines the latency of the in/out register operand of an atomic
 438 // XADDrm. This is the first of a sequence of two writes used to model atomic
 439 // XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
 440 // Both are selected by the JWriteXADDrm_Part1/Part2 variants when
 441 // CheckLockPrefix holds.
 442 // Example:
 443 //    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
 444 //                               ## ECX write Latency: 11cy
 445 //
 446 // The value of ECX becomes available only after 11cy from the start of
 447 // execution. This write is used to specifically set that operand latency.
 448 def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
 449   let Latency = 11; // ECX write latency from the example above
 450   let ResourceCycles = [3];
 451   let NumMicroOps = 3;
 452 }
 453
 454 // This write defines the latency of the in/out register operand of an atomic
 455 // XCHGrm. This write is the first of a sequence of two writes that describe
 456 // atomic XCHG operations (the second one is JWriteXCHGrm_LdSt_Part). We need
 457 // two writes because the instruction latency differs from the output register
 458 // write latency.  We want to make sure that the output register operand
 459 // becomes visible after 11cy. However, we want to set the instruction latency to 16cy.
 460 def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
 461   let Latency = 11; // output register operand latency
 462   let ResourceCycles = [2];
 463   let NumMicroOps = 2;
 464 }
 465
 466 def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { // second write of a non-atomic XADDrm
 467   let Latency = 11; // becomes the instruction latency (MAX over both writes)
 468   let ResourceCycles = [1, 1];
 469   let NumMicroOps = 1;
 470 }
 471
 472 def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { // second write of XCHGrm and of LOCK XADDrm
 473   let Latency = 16; // becomes the instruction latency (MAX over both writes)
 474   let ResourceCycles = [16, 16]; // LAGU and SAGU are held for the whole latency
 475   let NumMicroOps = 1;
 476 }
 477
 478 def JWriteXADDrm_Part1 : SchedWriteVariant<[ // register-operand write of XADDrm
 479   SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
 480   SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
 481 ]>;
 482
 483 def JWriteXADDrm_Part2 : SchedWriteVariant<[ // load/store write of XADDrm
 484   SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
 485   SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
 486 ]>;
 487
     // XADD with a memory operand, plain and LOCK-prefixed: both variant writes,
     // then a ReadAfterLd entry.
 488 def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
 489                  (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
 490                          LXADD8, LXADD16, LXADD32, LXADD64)>;
 491
     // XCHG with a memory operand always gets the 11cy/16cy pair; no variant is used.
 492 def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
 493                  (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
 494
 495
 496 ////////////////////////////////////////////////////////////////////////////////
 497 // Floating point. This covers both scalar and vector operations.
 498 ////////////////////////////////////////////////////////////////////////////////
     //
     // NOTE(review): X86WriteRes rows presumably read: write, resources, latency,
     // resource cycles (one per resource), micro-ops -- the multiclass is defined
     // outside this file, confirm there.
 499
 500 defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
 501 defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
 502 defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
     // FP/vector loads: 5cy (6cy when masked); the Y rows double the resource cycles.
 503 defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
 504 defm : X86WriteRes<WriteFLoadX,        [JLAGU], 5, [1], 1>;
 505 defm : X86WriteRes<WriteFLoadY,        [JLAGU], 5, [2], 2>;
 506 defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
 507 defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
 508
     // FP/vector stores go through the SAGU plus the store/convert unit on pipe 1.
     // NOTE(review): WriteFStoreNTY doubles the resource cycles like WriteFStoreY
     // but keeps 1 micro-op where WriteFStoreY has 2 -- confirm this is intended.
 509 defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
 510 defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
 511 defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [2, 2, 2], 2>;
 512 defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
 513 defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
 514 defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
 515
     // Masked stores use both FPU pipes, both AGUs and the integer ALUs, with
     // 10 to 36 micro-ops.
 516 defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
 517 defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
 518 defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
 519 defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
 520
 521 defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
 522 defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
 523 defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;
 524
 525 defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;
 526
 527 defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>; // FP add/compare rows use pipe 0 with the FPA unit.
 528 defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
 529 defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
 530 defm : X86WriteResPairUnsupported<WriteFAddZ>;
 531 defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
 532 defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
 533 defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
 534 defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
 535 defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
 536 defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
 537 defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
 538 defm : X86WriteResPairUnsupported<WriteFCmpZ>;
 539 defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
 540 defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
 541 defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
 542 defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
 543 defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>; // also uses integer ALU0
 544 defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>; // FP mul/rcp/rsqrt/div/sqrt rows use pipe 1 with the FPM unit.
 545 defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
 546 defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
 547 defm : X86WriteResPairUnsupported<WriteFMulZ>;
 548 defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
 549 defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
 550 defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
 551 defm : X86WriteResPairUnsupported<WriteFMul64Z>;
 552 defm : X86WriteResPairUnsupported<WriteFMA>; // none of the four FMA writes gets resources here
 553 defm : X86WriteResPairUnsupported<WriteFMAX>;
 554 defm : X86WriteResPairUnsupported<WriteFMAY>;
 555 defm : X86WriteResPairUnsupported<WriteFMAZ>;
 556 defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>; // dot products use both FPM and FPA
 557 defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
 558 defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
 559 defm : X86WriteResPairUnsupported<WriteDPPSZ>;
 560 defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
 561 defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
 562 defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
 563 defm : X86WriteResPairUnsupported<WriteFRcpZ>;
 564 defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
 565 defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
 566 defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
 567 defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
 568 defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>; // div/sqrt rows hold JFPM for the whole latency
 569 defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
 570 defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>; // Y rows: twice the X latency and JFPM cycles
 571 defm : X86WriteResPairUnsupported<WriteFDivZ>;
 572 defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
 573 defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
 574 defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
 575 defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
 576 defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
 577 defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
 578 defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
 579 defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
 580 defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
 581 defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
 582 defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
 583 defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
 584 defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
 585 defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
 586 defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>; // rounding uses the store/convert unit
 587 defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
 588 defm : X86WriteResPairUnsupported<WriteFRndZ>;
 589 defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>; // logic/shuffle/blend rows may use either pipe and FP unit
 590 defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
 591 defm : X86WriteResPairUnsupported<WriteFLogicZ>;
 592 defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>; // also uses integer ALU0
 593 defm : JWriteResYMMPair<WriteFTestY ,     [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
 594 defm : X86WriteResPairUnsupported<WriteFTestZ>;
 595 defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
 596 defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
 597 defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
 598 defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
 599 defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
 600 defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
 601 defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
 602 defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
 603 defm : X86WriteResPairUnsupported<WriteFBlendZ>;
 604 defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
 605 defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
 606 defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
 607 defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
 608 defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
 609
 610 ////////////////////////////////////////////////////////////////////////////////
 611 // Conversions.
 612 ////////////////////////////////////////////////////////////////////////////////
 613
     // FP -> integer. The scalar forms (SS2I/SD2I) list JFPU0, JFPA and JALU0
     // in addition to JFPU1/JSTC; the packed forms list FPU-side resources only.
 614 defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
 615 defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
 616 defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
 617 defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
 618 defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
 619 defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
 620 defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
 621 defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
 622
     // Integer -> FP. The scalar register and load forms are bound as two
     // separate X86WriteRes entries (WriteCvtI2SS / WriteCvtI2SSLd, and likewise
     // for SD) instead of through a pair helper; the load forms add JLAGU.
 623 defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
 624 defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
 625 defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
 626 defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
 627 defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
 628 defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
 629 defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
 630 defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
 631 defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
 632 defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
 633
     // Single -> double precision.
 634 defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
 635 defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
 636 defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
 637 defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
 638
     // Double -> single precision.
 639 defm : JWriteResFpuPair<WriteCvtSD2SS,    [JFPU1, JSTC], 7, [1,2], 2>;
 640 defm : JWriteResFpuPair<WriteCvtPD2PS,    [JFPU1, JSTC], 3, [1,1], 1>;
 641 defm : JWriteResYMMPair<WriteCvtPD2PSY,   [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
 642 defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
 643
     // Half -> single precision.
 644 defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
 645 defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
 646 defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
 647
     // Single -> half precision. Register and store forms are bound as separate
     // X86WriteRes entries; the store forms add JSAGU.
 648 defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>;
 649 defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
 650 defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
 651 defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
 652 defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
 653 defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
 654
 655 ////////////////////////////////////////////////////////////////////////////////
 656 // Vector integer operations.
 657 ////////////////////////////////////////////////////////////////////////////////
 658
     // Loads: every entry lists JLAGU. The X/Y loads list JLAGU only, while the
     // plain, non-temporal and masked loads also list JFPU01 and JVALU.
 659 defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
 660 defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>;
 661 defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
 662 defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
 663 defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
 664 defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
 665 defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
 666
     // Stores: every entry lists JSAGU. Plain and non-temporal stores pair it
     // with JFPU1/JSTC; masked stores pair it with JFPU01/JVALU.
 667 defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
 668 defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
 669 defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
 670 defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
 671 defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
 672 defm : X86WriteRes<WriteVecMaskedStore,   [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
 673 defm : X86WriteRes<WriteVecMaskedStoreY,  [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
 674
     // Register moves. Transfers to and from GPRs are bound separately from the
     // vector-to-vector moves and list different resources.
 675 defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
 676 defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
 677 defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
 678 defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
 679 defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;
 680
     // Arithmetic, shift, multiply, shuffle, blend, logic and test writes.
     // Apart from WriteVecTestY, every "Y" and "Z" write in this group is
     // declared unsupported; so are WriteVarVecShift, WriteShuffle256 and
     // WriteVarShuffle256.
 681 defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
 682 defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
 683 defm : X86WriteResPairUnsupported<WriteVecALUY>;
 684 defm : X86WriteResPairUnsupported<WriteVecALUZ>;
 685 defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
 686 defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
 687 defm : X86WriteResPairUnsupported<WriteVecShiftY>;
 688 defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
 689 defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
 690 defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
 691 defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
 692 defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
 693 defm : X86WriteResPairUnsupported<WriteVarVecShift>;
 694 defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
 695 defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
     // Integer multiplies are bound to JFPU0/JVIMUL; WritePMULLD additionally
     // lists JFPU01 and JVALU.
 696 defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
 697 defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
 698 defm : X86WriteResPairUnsupported<WriteVecIMulY>;
 699 defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
 700 defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
 701 defm : X86WriteResPairUnsupported<WritePMULLDY>;
 702 defm : X86WriteResPairUnsupported<WritePMULLDZ>;
 703 defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
 704 defm : X86WriteResPairUnsupported<WriteMPSADY>;
 705 defm : X86WriteResPairUnsupported<WriteMPSADZ>;
 706 defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
 707 defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
 708 defm : X86WriteResPairUnsupported<WritePSADBWY>;
 709 defm : X86WriteResPairUnsupported<WritePSADBWZ>;
 710 defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
 711 defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
 712 defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
 713 defm : X86WriteResPairUnsupported<WriteShuffleY>;
 714 defm : X86WriteResPairUnsupported<WriteShuffleZ>;
 715 defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
 716 defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
 717 defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
 718 defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
 719 defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
 720 defm : X86WriteResPairUnsupported<WriteBlendY>;
 721 defm : X86WriteResPairUnsupported<WriteBlendZ>;
 722 defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
 723 defm : X86WriteResPairUnsupported<WriteVarBlendY>;
 724 defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
 725 defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
 726 defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
 727 defm : X86WriteResPairUnsupported<WriteVecLogicY>;
 728 defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
     // The vector test writes use the same resource lists as the FP test
     // writes (WriteFTest / WriteFTestY) bound earlier in this file.
 729 defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
 730 defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
 731 defm : X86WriteResPairUnsupported<WriteVecTestZ>;
 732 defm : X86WriteResPairUnsupported<WriteShuffle256>;
 733 defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
 734
 735 ////////////////////////////////////////////////////////////////////////////////
 736 // Vector insert/extract operations.
 737 ////////////////////////////////////////////////////////////////////////////////
 738
     // Register and memory forms are bound as separate X86WriteRes entries.
     // The load form of insert adds JLAGU to the register form's resources; the
     // store form of extract lists JFPU1/JSTC/JSAGU instead of the
     // JFPU0/JFPA/JALU0 set used by the register form.
 739 defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
 740 defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
 741 defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
 742 defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
 743
 744 ////////////////////////////////////////////////////////////////////////////////
 745 // SSE42 String instructions.
 746 ////////////////////////////////////////////////////////////////////////////////
 747
     // WritePCmpIStrI and WritePCmpIStrM differ only in their third argument
     // (7 vs 8, presumably the latency -- confirm against JWriteResFpuPair).
     // The two WritePCmpEStr* writes are bound identically and also list the
     // address-generation units JSAGU and JLAGU.
 748 defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
 749 defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
 750 defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
 751 defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
 752
 753 ////////////////////////////////////////////////////////////////////////////////
 754 // MOVMSK Instructions.
 755 ////////////////////////////////////////////////////////////////////////////////
 756
     // The FP, vector and MMX MOVMSK writes are bound directly with WriteRes,
     // all on JFPU0/JFPA/JALU0 with a 3-cycle latency override. The "Y" vector
     // variant is declared unsupported.
 757 def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
 758 def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
 759 defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
 760 def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
 761
 762 ////////////////////////////////////////////////////////////////////////////////
 763 // AES Instructions.
 764 ////////////////////////////////////////////////////////////////////////////////
 765
     // WriteAESIMC and WriteAESKeyGen are bound identically on JFPU0/JVIMUL;
     // WriteAESDecEnc additionally lists JFPU01 and JVALU.
 766 defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>;
 767 defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
 768 defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
 769
 770 ////////////////////////////////////////////////////////////////////////////////
 771 // Horizontal add/sub instructions.
 772 ////////////////////////////////////////////////////////////////////////////////
 773
     // The FP horizontal writes are bound on JFPU0/JFPA and the integer PHAdd
     // writes on JFPU01/JVALU. WritePHAddY is declared unsupported.
 774 defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
 775 defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
 776 defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
 777 defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
 778 defm : X86WriteResPairUnsupported<WritePHAddY>;
 779
 780 ////////////////////////////////////////////////////////////////////////////////
 781 // Carry-less multiplication instructions.
 782 ////////////////////////////////////////////////////////////////////////////////
 783
     // Bound with the same arguments as WriteVecIMul: JFPU0/JVIMUL, 2.
 784 defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>;
 785
 786 ////////////////////////////////////////////////////////////////////////////////
 787 // SSE4A instructions.
 788 ////////////////////////////////////////////////////////////////////////////////
 789
     // Dedicated write for INSERTQ/INSERTQI: latency 2, with JFPU01 held for
     // 1 cycle and JVALU for 4 (ResourceCycles is positional against the
     // resource list).
 790 def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
 791   let Latency = 2;
 792   let ResourceCycles = [1, 4];
 793 }
     // Bind both opcode forms to the write above.
 794 def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
 795
 796 ////////////////////////////////////////////////////////////////////////////////
 797 // AVX instructions.
 798 ////////////////////////////////////////////////////////////////////////////////
 799
     // Register form of VEXTRACTF128: occupies JFPU01/JFPX and overrides no
     // fields, so the SchedWriteRes defaults apply.
 800 def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
 801 def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
 802
     // 256-bit broadcasts from memory: latency 6 and 2 micro-ops, holding JLAGU
     // for 1 cycle, JFPU01 for 2 and JFPX for 4.
 803 def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
 804   let Latency = 6;
 805   let ResourceCycles = [1, 2, 4];
 806   let NumMicroOps = 2;
 807 }
 808 def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
 809                                             VBROADCASTSSYrm,
 810                                             VBROADCASTF128)>;
 811
     // VZEROALL: modelled with an empty resource list, so its cost is carried
     // entirely by the latency (90) and micro-op count (73).
 812 def JWriteJVZEROALL: SchedWriteRes<[]> {
 813   let Latency = 90;
 814   let NumMicroOps = 73;
 815 }
 816 def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
 817
     // VZEROUPPER: same modelling approach as VZEROALL (empty resource list),
     // with latency 46 and 37 micro-ops.
 818 def JWriteJVZEROUPPER: SchedWriteRes<[]> {
 819   let Latency = 46;
 820   let NumMicroOps = 37;
 821 }
 822 def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
 823
 824 ///////////////////////////////////////////////////////////////////////////////
 825 //  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
 826 ///////////////////////////////////////////////////////////////////////////////
 827
     // Latency 34 and 63 micro-ops. ResourceCycles is positional against the
     // resource list, so the largest reservations are JSAGU (16 cycles) and
     // JALU01 (42 cycles).
 828 def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
 829   let Latency = 34;
 830   let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
 831   let NumMicroOps = 63;
 832 }
     // The SSE2 and AVX encodings, including the 64-bit forms, share this write.
 833 def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
 834                                          VMASKMOVDQU, VMASKMOVDQU64)>;
 835
 836 ///////////////////////////////////////////////////////////////////////////////
 837 //  SchedWriteVariant definitions.
 838 ///////////////////////////////////////////////////////////////////////////////
 839
     // Write with no resources and zero latency; used below as the outcome of
     // the zero-idiom variants when their predicate matches.
 840 def JWriteZeroLatency : SchedWriteRes<[]> {
 841   let Latency = 0;
 842 }
 843
     // Outcome used for the YMM zero-idiom variants below: unlike
     // JWriteZeroLatency it still occupies JFPU01/JFPX and issues 2 micro-ops,
     // and it does not override the latency.
 844 def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
 845   let NumMicroOps = 2;
 846 }
 847
 848 // Certain instructions that use the same register for both source
 849 // operands do not have a real dependency on the previous contents of the
 850 // register, and thus, do not have to wait before completing. They can be
 851 // optimized out at register renaming stage.
 852 // Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
 853 // 15h Processors".
 854 // Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
 855 // Section 21.8 [Dependency-breaking instructions].
 856
     // GPR zero idioms: when ZeroIdiomPredicate matches, the instruction is
     // costed as JWriteZeroLatency; otherwise it falls back to WriteALU.
 857 def JWriteZeroIdiom : SchedWriteVariant<[
 858     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
 859     SchedVar<NoSchedPred,                          [WriteALU]>
 860 ]>;
     // Applied to the 32- and 64-bit register-register SUB and XOR forms.
 861 def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
 862                                         XOR32rr, XOR64rr)>;
 863
     // FP logic zero idioms (XMM): JWriteZeroLatency when ZeroIdiomPredicate
     // matches, otherwise the regular WriteFLogic cost.
 864 def JWriteFZeroIdiom : SchedWriteVariant<[
 865     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
 866     SchedVar<NoSchedPred,                          [WriteFLogic]>
 867 ]>;
     // Applied to the SSE and AVX register forms of XORPS/XORPD/ANDNPS/ANDNPD.
 868 def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
 869                                          ANDNPSrr, VANDNPSrr,
 870                                          ANDNPDrr, VANDNPDrr)>;
 871
     // FP logic zero idioms (YMM): when ZeroIdiomPredicate matches, the
     // instruction is costed as JWriteZeroIdiomYmm rather than
     // JWriteZeroLatency, so it still consumes resources; otherwise it uses
     // WriteFLogicY.
 872 def JWriteFZeroIdiomY : SchedWriteVariant<[
 873     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
 874     SchedVar<NoSchedPred,                          [WriteFLogicY]>
 875 ]>;
 876 def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
 877                                           VANDNPSYrr, VANDNPDYrr)>;
 878
     // MMX logic zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
     // matches, otherwise WriteVecLogic.
 879 def JWriteVZeroIdiomLogic : SchedWriteVariant<[
 880     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
 881     SchedVar<NoSchedPred,                          [WriteVecLogic]>
 882 ]>;
 883 def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
 884
     // XMM integer logic zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
     // matches, otherwise WriteVecLogicX.
 885 def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
 886     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
 887     SchedVar<NoSchedPred,                          [WriteVecLogicX]>
 888 ]>;
     // Applied to the SSE and AVX register forms of PXOR and PANDN.
 889 def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
 890                                                PANDNrr, VPANDNrr)>;
 891
     // MMX ALU zero idioms: JWriteZeroLatency when ZeroIdiomPredicate matches,
     // otherwise WriteVecALU.
 892 def JWriteVZeroIdiomALU : SchedWriteVariant<[
 893     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
 894     SchedVar<NoSchedPred,                          [WriteVecALU]>
 895 ]>;
     // Applied to the MMX subtract, saturating-subtract and compare-greater
     // register forms listed here.
 896 def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
 897                                             MMX_PSUBQirr, MMX_PSUBWirr,
 898                                             MMX_PSUBSBirr, MMX_PSUBSWirr,
 899                                             MMX_PSUBUSBirr, MMX_PSUBUSWirr,
 900                                             MMX_PCMPGTBirr, MMX_PCMPGTDirr,
 901                                             MMX_PCMPGTWirr)>;
 902
     // XMM ALU zero idioms: JWriteZeroLatency when ZeroIdiomPredicate matches,
     // otherwise WriteVecALUX.
 903 def JWriteVZeroIdiomALUX : SchedWriteVariant<[
 904     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
 905     SchedVar<NoSchedPred,                          [WriteVecALUX]>
 906 ]>;
     // Each row pairs the SSE opcode with its AVX (V-prefixed) counterpart.
     // Unlike the MMX list, this one also includes a PCMPGTQ form.
 907 def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
 908                                              PSUBDrr, VPSUBDrr,
 909                                              PSUBQrr, VPSUBQrr,
 910                                              PSUBWrr, VPSUBWrr,
 911                                              PSUBSBrr, VPSUBSBrr,
 912                                              PSUBSWrr, VPSUBSWrr,
 913                                              PSUBUSBrr, VPSUBUSBrr,
 914                                              PSUBUSWrr, VPSUBUSWrr,
 915                                              PCMPGTBrr, VPCMPGTBrr,
 916                                              PCMPGTDrr, VPCMPGTDrr,
 917                                              PCMPGTQrr, VPCMPGTQrr,
 918                                              PCMPGTWrr, VPCMPGTWrr)>;
 919
     // VPERM2F128rr uses its own predicate (ZeroIdiomVPERMPredicate): when it
     // matches, the instruction is costed as JWriteZeroIdiomYmm; otherwise it
     // uses WriteFShuffle256.
 920 def JWriteVPERM2F128 : SchedWriteVariant<[
 921   SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
 922   SchedVar<NoSchedPred,                               [WriteFShuffle256]>
 923 ]>;
 924 def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
 925
 926 // This write is used for slow LEA instructions.
     // It occupies both JALU1 and JSAGU (the latter is noted at its definition
     // as also handling 3-operand LEA) and has a latency of 2 cycles.
 927 def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
 928   let Latency = 2;
 929 }
 930
 931 // On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
 932 // with a `Scale` value different than 1.
     // The predicate holds if either alternative inside CheckAny matches.
 933 def JSlowLEAPredicate : MCSchedPredicate<
 934   CheckAny<[
 935     // A 3-operand LEA (base, index, offset).
 936     IsThreeOperandsLEAFn,
 937     // An LEA with a "Scale" different than 1.
       // Operand 2 must be an immediate (CheckIsImmOperand<2>) and must not be
       // equal to 1; both conditions are required by the enclosing CheckAll.
 938     CheckAll<[
 939       CheckIsImmOperand<2>,
 940       CheckNot<CheckImmOperand<2, 1>>
 941     ]>
 942   ]>
 943 >;
 944
     // LEA cost selection: JWrite3OpsLEA when JSlowLEAPredicate matches,
     // otherwise the generic WriteLEA.
 945 def JWriteLEA : SchedWriteVariant<[
 946     SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
 947     SchedVar<NoSchedPred,       [WriteLEA]>
 948 ]>;
 949
     // Applied to LEA32r, LEA64r and LEA64_32r; LEA16r is bound separately.
 950 def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
 951
     // LEA16r is bound unconditionally (no predicate) to this write:
     // latency 3, with JALU01 held for 4 cycles.
 952 def JSlowLEA16r : SchedWriteRes<[JALU01]> {
 953   let Latency = 3;
 954   let ResourceCycles = [4];
 955 }
 956
 957 def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
 958
 959 ///////////////////////////////////////////////////////////////////////////////
 960 // Dependency breaking instructions.
 961 ///////////////////////////////////////////////////////////////////////////////
 962
     // Opcodes reported as zero idioms when the predicate paired with their
     // class holds. The opcode lists match the ones bound through InstRW to
     // the JWrite*ZeroIdiom* variants and JWriteVPERM2F128 earlier in this
     // file.
     // NOTE(review): the two sets are maintained by hand; update both together.
 963 def : IsZeroIdiomFunction<[
 964   // GPR Zero-idioms.
 965   DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
 966
 967   // MMX Zero-idioms.
 968   DepBreakingClass<[
 969     MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
 970     MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
 971     MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
 972     MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
 973   ], ZeroIdiomPredicate>,
 974
 975   // SSE Zero-idioms.
 976   DepBreakingClass<[
 977     // fp variants.
 978     XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
 979
 980     // int variants.
 981     PXORrr, PANDNrr,
 982     PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
 983     PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
 984     PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
 985   ], ZeroIdiomPredicate>,
 986
 987   // AVX Zero-idioms.
 988   DepBreakingClass<[
 989     // xmm fp variants.
 990     VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
 991
 992     // xmm int variants.
 993     VPXORrr, VPANDNrr,
 994     VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
 995     VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
 996     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
 997
 998     // ymm variants.
 999     VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1000   ], ZeroIdiomPredicate>,
1001
       // VPERM2F128rr is the only entry checked with ZeroIdiomVPERMPredicate.
1002   DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
1003 ]>;
1004
     // Opcodes reported as dependency-breaking when the predicate paired with
     // their class holds. This list is separate from the zero-idiom list
     // declared just before it in this file.
1005 def : IsDepBreakingFunction<[
1006   // GPR
       // SBB uses ZeroIdiomPredicate; CMP uses an explicit same-register check
       // on operands 0 and 1.
1007   DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
1008   DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
1009
1010   // MMX
1011   DepBreakingClass<[
1012     MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
1013   ], ZeroIdiomPredicate>,
1014
1015   // SSE
1016   DepBreakingClass<[
1017     PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1018   ], ZeroIdiomPredicate>,
1019
1020   // AVX
1021   DepBreakingClass<[
1022     VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1023   ], ZeroIdiomPredicate>
1024 ]>;
1025
     // Register-to-register moves declared as optimizable. All opcodes are in
     // one equivalence class guarded by TruePred, i.e. no additional operand
     // condition is attached.
     // NOTE(review): no YMM move forms are listed here -- presumably
     // intentional; confirm before adding any.
1026 def : IsOptimizableRegisterMove<[
1027   InstructionEquivalenceClass<[
1028     // GPR variants.
1029     MOV32rr, MOV64rr,
1030
1031     // MMX variants.
1032     MMX_MOVQ64rr,
1033
1034     // SSE variants.
1035     MOVAPSrr, MOVUPSrr,
1036     MOVAPDrr, MOVUPDrr,
1037     MOVDQArr, MOVDQUrr,
1038
1039     // AVX variants.
1040     VMOVAPSrr, VMOVUPSrr,
1041     VMOVAPDrr, VMOVUPDrr,
1042     VMOVDQArr, VMOVDQUrr
1043   ], TruePred >
1044 ]>;
1045
1046 } // SchedModel