llvm/lib/Target/X86/X86ScheduleBtVer2.td

   1 //=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the machine model for AMD btver2 (Jaguar) to support
  10 // instruction scheduling and other instruction cost heuristics. Based off AMD Software
  11 // Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 def BtVer2Model : SchedMachineModel {
       // Machine model record that the definitions in the
       // `let SchedModel = BtVer2Model in { ... }` region are attached to.
       //
  16   // All x86 instructions are modeled as a single micro-op, and btver2 can
  17   // decode 2 instructions per cycle.
  18   let IssueWidth = 2;
  19   let MicroOpBufferSize = 64; // Retire Control Unit (same capacity as JRCU)
  20   let LoadLatency = 5; // FPU load latency (worst case; integer loads are 3 cycles)
  21   let HighLatency = 25;
  22   let MispredictPenalty = 14; // Minimum branch misdirection penalty
  23   let PostRAScheduler = 1;
  24
  25   // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
  26   // the scheduler to assign a default model to unrecognized opcodes.
       // NOTE(review): YMM write pairs are modeled further down in this file, so
       // the FIXME text may be stale - confirm before changing CompleteModel.
  27   let CompleteModel = 0;
  28 }
  29
  30 let SchedModel = BtVer2Model in {
  31
  32 // Jaguar can issue up to 6 micro-ops in one cycle: four integer pipes and two
     // vector/FPU pipes, each modeled as a single-unit resource.
  33 def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
  34 def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
  35 def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
  36 def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
  37 def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
  38 def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
  39
  40 // The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
  41 // speculative version of the 64-bit integer registers.
  42 // Reference: www.realworldtech.com/jaguar/4/
  43 //
  44 // The processor always keeps the different parts of an integer register
  45 // together. An instruction that writes to a part of a register will therefore
  46 // have a false dependence on any previous write to the same register or any
  47 // part of it.
  48 // Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
  49 // access" - Agner Fog's "microarchitecture.pdf".
     //
     // NOTE(review): the two positional lists after the register classes look
     // like a per-class rename cost ([1, 1]) and a per-class "moves may be
     // eliminated" flag ([1, 0], i.e. GR64 only) - confirm against the
     // RegisterFile class definition.
  50 def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
  51                                0,  // Max moves that can be eliminated per cycle.
  52                                1>; // Restrict move elimination to zero regs.
  53
  54 // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
  55 // registers. Operations on 256-bit data types are cracked into two COPs.
  56 // Reference: www.realworldtech.com/jaguar/4/
  57
  58 // The PRF in the floating point unit can eliminate a move from an MMX or SSE
  59 // register that is known to be zero (i.e. it has been zeroed using a zero-idiom
  60 // dependency breaking instruction, or via VZEROALL).
  61 // Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
  62 // instructions" - Agner Fog's "microarchitecture.pdf"
     //
     // NOTE(review): the [1, 1, 2] list presumably charges two physical registers
     // for a VR256 value (matching the "cracked into two COPs" remark), and
     // [1, 1, 0] presumably limits move elimination to VR64/VR128 - confirm
     // against the RegisterFile class definition.
  63 def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
  64                           0,  // Max moves that can be eliminated per cycle.
  65                           1>; // Restrict move elimination to zero regs.
  66
  67 // The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
  68 // retire up to two macro-ops per cycle.
  69 // Reference: "Software Optimization Guide for AMD Family 16h Processors"
  70 def JRCU : RetireControlUnit<64, 2>; // <in-flight capacity, retired per cycle>
  71
  72 // Integer Pipe Scheduler: a micro-op using JALU01 may go to either integer ALU.
     // NOTE(review): BufferSize is presumably the number of scheduler entries
     // shared by the group - confirm against the SOG.
  73 def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  74   let BufferSize=20;
  75 }
  76
  77 // AGU Pipe Scheduler: groups the load AGU and the store AGU.
  78 def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  79   let BufferSize=12;
  80 }
  81
  82 // Fpu Pipe Scheduler: a micro-op using JFPU01 may go to either vector/FPU pipe.
  83 def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  84   let BufferSize=18;
  85 }
  86
  87 // Functional units. These are listed alongside a pipe resource in the writes
     // below (e.g. [JALU1, JMul] or [JFPU1, JFPM]) so that a unit's occupancy can
     // be tracked separately from the pipe it sits on.
  88 def JDiv    : ProcResource<1>; // integer division
  89 def JMul    : ProcResource<1>; // integer multiplication
  90 def JVALU0  : ProcResource<1>; // vector integer
  91 def JVALU1  : ProcResource<1>; // vector integer
  92 def JVIMUL  : ProcResource<1>; // vector integer multiplication
  93 def JSTC    : ProcResource<1>; // vector store/convert
  94 def JFPM    : ProcResource<1>; // FP multiplication
  95 def JFPA    : ProcResource<1>; // FP addition
  96
  97 // Functional unit groups
  98 def JFPX  : ProcResGroup<[JFPA, JFPM]>;     // either FP unit (adder or multiplier)
  99 def JVALU : ProcResGroup<[JVALU0, JVALU1]>; // either vector integer unit
 100
 101 // Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
 102 // cycles after the memory operand.
 103 def : ReadAdvance<ReadAfterLd, 3>;
 104
 105 // Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
 106 // cycles after the memory operand.
 107 def : ReadAdvance<ReadAfterVecLd, 5>;
 108 def : ReadAdvance<ReadAfterVecXLd, 5>;
 109 def : ReadAdvance<ReadAfterVecYLd, 5>;
 110
 111 /// "Additional 6 cycle transfer operation which moves a floating point
 112 /// operation input value from the integer unit to the floating point unit."
 113 /// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
     /// The transfer is modeled as a negative read-advance of 6 cycles on
     /// ReadInt2Fpu operands.
 114 def : ReadAdvance<ReadInt2Fpu, -6>;
 115
 116 // Many SchedWrites are defined in pairs with and without a folded load.
 117 // Instructions with folded loads are usually micro-fused, so they only appear
 118 // as two micro-ops when dispatched by the schedulers.
 119 // This multiclass defines the resource usage for variants with and without
 120 // folded loads.
     //
     //   SchedRW  - the foldable write; SchedRW.Folded is its folded-load form.
     //   ExePorts - resources consumed by the register variant.
     //   Lat      - latency of the register variant.
     //   Res      - ReleaseAtCycles for ExePorts (callers pass one entry per
     //              resource); may be left empty.
     //   UOps     - micro-op count of the register variant.
     //   LoadUOps - micro-ops added on top of UOps for the folded-load variant.
 121 multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
 122                             list<ProcResourceKind> ExePorts,
 123                             int Lat, list<int> Res = [], int UOps = 1,
 124                             int LoadUOps = 0> {
 125   // Register variant: a single cycle on each ExePort unless Res overrides it.
 126   def : WriteRes<SchedRW, ExePorts> {
 127     let Latency = Lat;
 128     let ReleaseAtCycles = Res;
 129     let NumMicroOps = UOps;
 130   }
 131
 132   // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
 133   // latency. JLAGU is prepended to ExePorts, so a matching [1] is prepended
       // to Res; an empty Res is passed through unchanged.
 134   def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
 135     let Latency = !add(Lat, 3);
 136     let ReleaseAtCycles = !if(!empty(Res), [], !listconcat([1], Res));
 137     let NumMicroOps = !add(UOps, LoadUOps);
 138   }
 139 }
 140
 141 multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
 142                             list<ProcResourceKind> ExePorts,
 143                             int Lat, list<int> Res = [], int UOps = 1,
 144                             int LoadUOps = 0> {
       // FP/vector counterpart of JWriteResIntPair: same parameters and same
       // register variant, but the folded-load variant adds 5 cycles (the vector
       // load latency) instead of 3.
       //
 145   // Register variant: a single cycle on each ExePort unless Res overrides it.
 146   def : WriteRes<SchedRW, ExePorts> {
 147     let Latency = Lat;
 148     let ReleaseAtCycles = Res;
 149     let NumMicroOps = UOps;
 150   }
 151
 152   // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
 153   // latency. A [1] for JLAGU is prepended to Res unless Res is empty.
 154   def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
 155     let Latency = !add(Lat, 5);
 156     let ReleaseAtCycles = !if(!empty(Res), [], !listconcat([1], Res));
 157     let NumMicroOps = !add(UOps, LoadUOps);
 158   }
 159 }
 160
 161 multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
 162                             list<ProcResourceKind> ExePorts,
 163                             int Lat, list<int> Res = [2], int UOps = 2,
 164                             int LoadUOps = 0> {
       // 256-bit counterpart of JWriteResFpuPair. The defaults differ: two
       // micro-ops and two cycles of resource usage.
       // NOTE(review): the default Res = [2] has a single entry, so it only lines
       // up with a single-element ExePorts; the uses visible here pass Res
       // explicitly.
       //
 165   // Register variant holds ExePorts for the cycles given in Res (2 by default).
 166   def : WriteRes<SchedRW, ExePorts> {
 167     let Latency = Lat;
 168     let ReleaseAtCycles = Res;
 169     let NumMicroOps = UOps;
 170   }
 171
 172   // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
 173   // latency. Unlike the Int/Fpu pair multiclasses, the [2] for JLAGU is
       // prepended unconditionally, with no special case for an empty Res.
 174   def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
 175     let Latency = !add(Lat, 5);
 176     let ReleaseAtCycles = !listconcat([2], Res);
 177     let NumMicroOps = !add(UOps, LoadUOps);
 178   }
 179 }
 180
 181 // Instructions that have local forwarding disabled have an extra +1cy latency.
 182
 183 // A folded store needs a cycle on the SAGU for the store data, most RMW
 184 // instructions don't need an extra uop.  ALU RMW operations don't seem to
 185 // benefit from STLF, and their observed latency is 6cy. That is the reason why
 186 // this write adds two extra cycles (instead of just 1cy for the store).
     // NOTE(review): X86WriteRes is defined outside this file; its arguments
     // appear to be <write, resources, latency, release-at-cycles, micro-ops>,
     // i.e. 2cy latency, one cycle on JSAGU and no extra micro-op here.
 187 defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
 188
 189 ////////////////////////////////////////////////////////////////////////////////
 190 // Arithmetic.
 191 ////////////////////////////////////////////////////////////////////////////////
 192
     // JWriteResIntPair arguments are <write, resources, latency,
     // release-at-cycles, micro-ops>. Each use also defines the folded-load form
     // of the write (+3cy latency and one cycle on JLAGU).
 193 defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
 194 defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;
 195
 196 defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
 197 defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
 198 defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
 199 defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
 200 defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [2], 2>;
 201
     // Integer multiplies use ALU pipe 1 plus the multiplier unit.
 202 defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
 203 defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
 204 defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
 205 defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
 206 defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
 207 defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
 208 defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
 209 defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
 210 defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
 211 defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
 212 defm : X86WriteResUnsupported<WriteIMulH>;
 213 defm : X86WriteResUnsupported<WriteIMulHLd>;
 214 defm : X86WriteResPairUnsupported<WriteMULX32>;
 215 defm : X86WriteResPairUnsupported<WriteMULX64>;
 216
     // Integer divides hold JDiv for as many cycles as their latency; signed and
     // unsigned forms use identical numbers.
 217 defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
 218 defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
 219 defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
 220 defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
 221 defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
 222 defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
 223 defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
 224 defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
 225
 226 defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;
 227
 228 defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
 229 defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
 230 def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
 231 def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
 232 def  : WriteRes<WriteLAHFSAHF, [JALU01]>;
 233
 234 defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
 235 defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
 236 defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
 237 defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
 238 defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
 239 defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
 240
 241 // This is for simple LEAs with one or two input operands.
 242 def : WriteRes<WriteLEA, [JALU01]>;
 243
 244 // Bit counts.
 245 defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
 246 defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
 247 defm : JWriteResIntPair<WritePOPCNT,         [JALU01], 1>;
 248 defm : JWriteResIntPair<WriteLZCNT,          [JALU01], 1>;
 249 defm : JWriteResIntPair<WriteTZCNT,          [JALU01], 2, [2], 2>;
 250
 251 // BMI1 BEXTR/BLS, BMI2 BZHI
 252 defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
 253 defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
 254 defm : X86WriteResPairUnsupported<WriteBZHI>;
 255
 256 ////////////////////////////////////////////////////////////////////////////////
 257 // Integer shifts and rotates.
 258 ////////////////////////////////////////////////////////////////////////////////
 259
     // Plain shifts and rotates: one cycle on either integer ALU, whether the
     // count comes from an immediate or from CL.
 260 defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
 261 defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
 262 defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
 263 defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
 264
 265 // SHLD/SHRD. The register and memory forms are given separate writes rather
     // than a folded pair; both memory forms use the same numbers.
 266 defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
 267 defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
 268 defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
 269 defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
 270
 271 ////////////////////////////////////////////////////////////////////////////////
 272 // Loads, stores, and moves, not folded with other operations.
 273 ////////////////////////////////////////////////////////////////////////////////
 274
     // Loads go through the load AGU with the 3cy integer load latency; stores go
     // through the store AGU; register moves use either integer ALU.
 275 def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; }
 276 def : WriteRes<WriteStore,   [JSAGU]>;
 277 def : WriteRes<WriteStoreNT, [JSAGU]>;
 278 def : WriteRes<WriteMove,    [JALU01]>;
 279 defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 280
 281 // Load/store MXCSR. Modeled with the same resources as a plain load/store.
 282 def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
 283 def : WriteRes<WriteSTMXCSR, [JSAGU]>;
 284
 285 // Treat misc copies as a move.
 286 def : InstRW<[WriteMove], (instrs COPY)>;
 287
 288 ////////////////////////////////////////////////////////////////////////////////
 289 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 290 // These can often bypass execution ports completely.
 291 ////////////////////////////////////////////////////////////////////////////////
 292
 293 def : WriteRes<WriteZero,  []>; // Empty resource list: no pipe is consumed.
 294
 295 ////////////////////////////////////////////////////////////////////////////////
 296 // Branches don't produce values, so they have no latency, but they still
 297 // consume resources. Indirect branches can fold loads.
 298 ////////////////////////////////////////////////////////////////////////////////
 299
     // Defined as a pair so that the folded-load form (indirect branch through
     // memory) also gets a write.
 300 defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
 301
 302 ////////////////////////////////////////////////////////////////////////////////
 303 // Special case scheduling classes.
 304 ////////////////////////////////////////////////////////////////////////////////
 305
     // System and microcoded instructions share one 100cy latency value.
     // NOTE(review): 100 looks like a placeholder rather than a measurement -
     // confirm.
 306 def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
 307 def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
 308 def : WriteRes<WriteFence,  [JSAGU]>;
 309
 310 // Nops don't have dependencies, so there's no actual latency, but we set this
 311 // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
 312 def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
 313
 314 def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
       // Register-register 8-bit CMPXCHG (chosen by IsRegRegCompareAndSwap_8 in
       // JWriteCMPXCHGVariant).
 315   let Latency = 3;
 316   let ReleaseAtCycles = [3];
 317   let NumMicroOps = 3;
 318 }
 319
     // In the LOCK_* writes below, JLAGU and JSAGU are held for as many cycles as
     // the write's latency.
     //
     // Atomic 8-bit CMPXCHG with a memory operand (IsAtomicCompareAndSwap_8).
 320 def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 321   let Latency = 16;
 322   let ReleaseAtCycles = [3,16,16];
 323   let NumMicroOps = 5;
 324 }
 325
     // Atomic CMPXCHG with a memory operand (IsAtomicCompareAndSwap).
 326 def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 327   let Latency = 17;
 328   let ReleaseAtCycles = [3,17,17];
 329   let NumMicroOps = 6;
 330 }
 331
     // Non-atomic 8-bit CMPXCHG with a memory operand (IsRegMemCompareAndSwap_8).
 332 def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 333   let Latency = 11;
 334   let ReleaseAtCycles = [3,1,1];
 335   let NumMicroOps = 5;
 336 }
 337
     // Non-atomic CMPXCHG8B (IsCompareAndSwap8B).
 338 def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 339   let Latency = 11;
 340   let ReleaseAtCycles = [3,1,1];
 341   let NumMicroOps = 18;
 342 }
 343
     // Non-atomic CMPXCHG16B (IsCompareAndSwap16B).
 344 def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 345   let Latency = 32;
 346   let ReleaseAtCycles = [6,1,1];
 347   let NumMicroOps = 28;
 348 }
 349
     // Atomic CMPXCHG8B (IsAtomicCompareAndSwap8B).
 350 def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 351   let Latency = 19;
 352   let ReleaseAtCycles = [3,19,19];
 353   let NumMicroOps = 18;
 354 }
 355
     // Atomic CMPXCHG16B (IsAtomicCompareAndSwap16B).
 356 def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 357   let Latency = 38;
 358   let ReleaseAtCycles = [6,38,38];
 359   let NumMicroOps = 28;
 360 }
 361
     // Picks the CMPXCHG write that matches the instruction form. The atomic
     // predicates are listed before the non-atomic ones, and the last entry is
     // the unconditional fallback.
     // NOTE(review): this ordering presumably matters because the first matching
     // SchedVar is used - confirm.
 362 def JWriteCMPXCHGVariant :  SchedWriteVariant<[
 363   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
 364   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
 365   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
 366   SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
 367   SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
 368   SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
 369   SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
 370   SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
 371   SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
 372   SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
 373 ]>;
 374
 375 // The first five reads are contributed by the memory load operand.
 376 // We ignore those reads and set a read-advance for the other input operands
 377 // including the implicit read of RAX.
 378 def : InstRW<[JWriteCMPXCHGVariant,
 379               ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
 380               ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
 381                                                  LCMPXCHG32, LCMPXCHG64,
 382                                                  CMPXCHG8rm, CMPXCHG16rm,
 383                                                  CMPXCHG32rm, CMPXCHG64rm)>;
 384
     // Register-register forms have no memory operand, so no read overrides are
     // listed.
 385 def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
 386                                              CMPXCHG32rr, CMPXCHG64rr)>;
 387
     // CMPXCHG8B/16B: five memory-operand reads followed by four implicit
     // register reads.
 388 def : InstRW<[JWriteCMPXCHGVariant,
 389               // Ignore reads contributed by the memory operand.
 390               ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
 391               // Add a read-advance to every implicit register read.
 392               ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
 393                                                                            CMPXCHG8B, CMPXCHG16B)>;
 394
     // LOCK-prefixed unary ALU read-modify-write: both AGUs are held for all 19
     // cycles of the latency.
 395 def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
 396   let Latency = 19;
 397   let ReleaseAtCycles = [1,19,19];
 398   let NumMicroOps = 1;
 399 }
 400
     // Use the locked write when the instruction carries a LOCK prefix and the
     // generic WriteALURMW otherwise.
 401 def JWriteLOCK_ALURMWVariant :  SchedWriteVariant<[
 402   SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
 403   SchedVar<NoSchedPred,                       [WriteALURMW]>
 404 ]>;
     // Applied to the memory forms of INC, DEC, NOT and NEG.
 405 def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
 406                                                  DEC8m, DEC16m, DEC32m, DEC64m,
 407                                                  NOT8m, NOT16m, NOT32m, NOT64m,
 408                                                  NEG8m, NEG16m, NEG32m, NEG64m)>;
 409
     // Register-register XCHG8 and XADD of every width: 3 micro-ops holding an
     // integer ALU for 3 cycles, with a 2cy result latency.
 410 def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
 411   let Latency = 2;
 412   let ReleaseAtCycles = [3];
 413   let NumMicroOps = 3;
 414 }
 415 def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
 416                                                       XADD32rr, XADD64rr)>;
 417
 418 // This write defines the latency of the in/out register operand of a non-atomic
 419 // XADDrm. This is the first of a pair of writes that model non-atomic
 420 // XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
 421 //
 422 // We need two writes because the instruction latency differs from the output
 423 // register operand latency. In particular, the first write describes the first
 424 // (and only) output register operand of the instruction.  However, the
 425 // instruction latency is set to the MAX of all the write latencies. That's why
 426 // a second write is needed in this case (see example below).
 427 //
 428 // Example:
 429 //     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
 430 //                            ## ECX write Latency: 3cy
 431 //
 432 // Register ECX becomes available in 3 cycles. That is because the value of ECX
 433 // is exchanged with the value loaded from the memory operand (here, the top of
 434 // the stack), and the load-to-use latency is assumed to be 3cy.
 435 def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
 436   let Latency = 3;  // load-to-use latency
 437   let ReleaseAtCycles = [3];
 438   let NumMicroOps = 3;
 439 }
 440
 441 // This write defines the latency of the in/out register operand of an atomic
 442 // XADDrm. This is the first of a sequence of two writes used to model atomic
 443 // XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
 444 // (The XCHG load/store part is reused on purpose: JWriteXADDrm_Part2 selects it
 445 // whenever the LOCK prefix is present.)
 446 // Example:
 447 //    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
 448 //                               ## ECX write Latency: 11cy
 449 //
 450 // The value of ECX becomes available only after 11cy from the start of
 451 // execution. This write is used to specifically set that operand latency.
 452 def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
 453   let Latency = 11;
 454   let ReleaseAtCycles = [3];
 455   let NumMicroOps = 3;
 456 }
 457
 458 // This write defines the latency of the in/out register operand of an atomic
 459 // XCHGrm. This write is the first of a sequence of two writes that describe
 460 // atomic XCHG operations. We need two writes because the instruction latency
 461 // differs from the output register write latency.  We want to make sure that
 462 // the output register operand becomes visible after 11cy. However, we want to
 463 // set the instruction latency to 16cy.
     // The XCHGrm mapping uses this write unconditionally; no CheckLockPrefix
     // variant is involved.
 464 def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
 465   let Latency = 11;
 466   let ReleaseAtCycles = [2];
 467   let NumMicroOps = 2;
 468 }
 469
     // Second write of the non-atomic XADDrm pair: carries the 11cy instruction
     // latency and one cycle on each AGU.
 470 def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
 471   let Latency = 11;
 472   let ReleaseAtCycles = [1, 1];
 473   let NumMicroOps = 1;
 474 }
 475
     // Second write for XCHGrm and for LOCK XADDrm: carries the 16cy instruction
     // latency and holds both AGUs for all 16 cycles.
 476 def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
 477   let Latency = 16;
 478   let ReleaseAtCycles = [16, 16];
 479   let NumMicroOps = 1;
 480 }
 481
     // First write of XADDrm (the register operand): 11cy with a LOCK prefix,
     // 3cy without.
 482 def JWriteXADDrm_Part1 : SchedWriteVariant<[
 483   SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
 484   SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
 485 ]>;
 486
     // Second write of XADDrm (the load/store part): 16cy with a LOCK prefix,
     // 11cy without.
 487 def JWriteXADDrm_Part2 : SchedWriteVariant<[
 488   SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
 489   SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
 490 ]>;
 491
     // Both XADDrm and LXADD use the variants above.
 492 def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
 493                  (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
 494                          LXADD8, LXADD16, LXADD32, LXADD64)>;
 495
     // XCHGrm always uses the 11cy register part and the 16cy load/store part.
 496 def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
 497                  (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
 498
 499
 500 ////////////////////////////////////////////////////////////////////////////////
 501 // Floating point. This covers both scalar and vector operations.
 502 ////////////////////////////////////////////////////////////////////////////////
 503
     // x87 constant loads.
 504 defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
 505 defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
 506 defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
     // FP/vector loads. The Y (256-bit) forms double the resource cycles and the
     // micro-op count of their 128-bit counterparts.
 507 defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
 508 defm : X86WriteRes<WriteFLoadX,        [JLAGU], 5, [1], 1>;
 509 defm : X86WriteRes<WriteFLoadY,        [JLAGU], 5, [2], 2>;
 510 defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
 511 defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
 512
     // FP/vector stores go through the store AGU and the store/convert unit on
     // FPU pipe 1.
 513 defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
 514 defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
 515 defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [2, 2, 2], 2>;
 516 defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
 517 defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
     // NOTE(review): WriteFStoreNTY keeps 1 micro-op while WriteFStoreY uses 2 -
     // confirm this is intentional.
 518 defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
 519
     // Masked stores use many micro-ops and occupy FP, AGU and integer ALU
     // resources.
 520 defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
 521 defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
 522 defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
 523 defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
 524
     // Register-to-register FP moves can use either FPU pipe and either FP unit.
 525 defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
 526 defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
 527 defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;
 528 defm : X86WriteResUnsupported<WriteFMoveZ>;
 529
 530 defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;
 531
 532 defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>;
     // From here on: JWriteResFpuPair / JWriteResYMMPair arguments are <write,
     // resources, latency, release-at-cycles, micro-ops>, and each use also
     // defines the folded-load form (+5cy latency plus JLAGU). The Y (256-bit)
     // entries double the resource cycles of the 128-bit entries; every Z write
     // is marked unsupported.
 533 defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
 534 defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
 535 defm : X86WriteResPairUnsupported<WriteFAddZ>;
 536 defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
 537 defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
 538 defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
 539 defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
 540 defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
 541 defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
 542 defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
 543 defm : X86WriteResPairUnsupported<WriteFCmpZ>;
 544 defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
 545 defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
 546 defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
 547 defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
 548 defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>;
 549 defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0],  3>;
 550 defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>;
 551 defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
 552 defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
 553 defm : X86WriteResPairUnsupported<WriteFMulZ>;
 554 defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
 555 defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
 556 defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
 557 defm : X86WriteResPairUnsupported<WriteFMul64Z>;
 558 defm : X86WriteResPairUnsupported<WriteFMA>;
 559 defm : X86WriteResPairUnsupported<WriteFMAX>;
 560 defm : X86WriteResPairUnsupported<WriteFMAY>;
 561 defm : X86WriteResPairUnsupported<WriteFMAZ>;
 562 defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
 563 defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
 564 defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
 565 defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
 566 defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
 567 defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
 568 defm : X86WriteResPairUnsupported<WriteFRcpZ>;
 569 defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
 570 defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
 571 defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
 572 defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
     // Divides and square roots hold JFPM for as many cycles as their latency;
     // the Y forms double both the latency and the occupancy.
 573 defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>;
 574 defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
 575 defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
 576 defm : X86WriteResPairUnsupported<WriteFDivZ>;
 577 defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
 578 defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
 579 defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
 580 defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
 581 defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
 582 defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
 583 defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
 584 defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
 585 defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
 586 defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
 587 defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
 588 defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
 589 defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
 590 defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
 591 defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>;
 592 defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
 593 defm : X86WriteResPairUnsupported<WriteFRndZ>;
 594 defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>;
 595 defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
 596 defm : X86WriteResPairUnsupported<WriteFLogicZ>;
 597 defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>;
 598 defm : JWriteResYMMPair<WriteFTestY ,     [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
 599 defm : X86WriteResPairUnsupported<WriteFTestZ>;
 600 defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
 601 defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
 602 defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
 603 defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
 604 defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
 605 defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
 606 defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
 607 defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
 608 defm : X86WriteResPairUnsupported<WriteFBlendZ>;
 609 defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
 610 defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
 611 defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
 612 defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
 613 defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
 614
 615 ////////////////////////////////////////////////////////////////////////////////
 616 // Conversions.
 617 ////////////////////////////////////////////////////////////////////////////////
 618
 619 defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>; // WriteCvtSS2I/SD2I also list JFPU0, JFPA and JALU0, unlike the packed forms below.
 620 defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
 621 defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
 622 defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
 623 defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
 624 defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
 625 defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>; // Adds JFPX and doubles the latency relative to WriteCvtPS2IY.
 626 defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
 627
 628 defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>; // Register and load (...Ld) forms of WriteCvtI2SS/I2SD are given as separate entries.
 629 defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>; // Load form adds JLAGU.
 630 defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
 631 defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
 632 defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
 633 defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
 634 defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
 635 defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
 636 defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
 637 defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
 638
 639 defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
 640 defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
 641 defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
 642 defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
 643
 644 defm : JWriteResFpuPair<WriteCvtSD2SS,    [JFPU1, JSTC], 7, [1,2], 2>;
 645 defm : JWriteResFpuPair<WriteCvtPD2PS,    [JFPU1, JSTC], 3, [1,1], 1>;
 646 defm : JWriteResYMMPair<WriteCvtPD2PSY,   [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
 647 defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
 648
 649 defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
 650 defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
 651 defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
 652
 653 defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>; // WriteCvtPS2PH* register and store (...St) forms are given as separate entries.
 654 defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
 655 defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
 656 defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>; // Store forms add JSAGU and one cycle of latency over the register forms.
 657 defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
 658 defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
 659
 660 ////////////////////////////////////////////////////////////////////////////////
 661 // Vector integer operations.
 662 ////////////////////////////////////////////////////////////////////////////////
 663
 664 defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; // Every vector load write in this group lists JLAGU.
 665 defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>;
 666 defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
 667 defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
 668 defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; // NOTE(review): identical to WriteVecLoadNT (no doubling as in WriteVecLoadY) -- confirm intended.
 669 defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
 670 defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
 671
 672 defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>; // Every supported vector store write lists JSAGU, JFPU1 and JSTC.
 673 defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
 674 defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
 675 defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
 676 defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
 677 defm : X86WriteResUnsupported<WriteVecMaskedStore32>; // All four masked-store writes are marked unsupported.
 678 defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
 679 defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
 680 defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
 681
 682 defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
 683 defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
 684 defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
 685 defm : X86WriteResUnsupported<WriteVecMoveZ>;
 686 defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>; // Same resource set as the MOVMSK writes later in this file.
 687 defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>; // GPR->vector is modeled with twice the latency of vector->GPR.
 688
 689 defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
 690 defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
 691 defm : X86WriteResPairUnsupported<WriteVecALUY>; // NOTE(review): every Y/Z integer write in this group is unsupported except WriteVecTestY; presumably the 256/512-bit integer forms -- confirm.
 692 defm : X86WriteResPairUnsupported<WriteVecALUZ>;
 693 defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
 694 defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
 695 defm : X86WriteResPairUnsupported<WriteVecShiftY>;
 696 defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
 697 defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
 698 defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
 699 defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
 700 defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
 701 defm : X86WriteResPairUnsupported<WriteVarVecShift>; // The variable-shift writes are unsupported at every width, including this base form.
 702 defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
 703 defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
 704 defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
 705 defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
 706 defm : X86WriteResPairUnsupported<WriteVecIMulY>;
 707 defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
 708 defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
 709 defm : X86WriteResPairUnsupported<WritePMULLDY>;
 710 defm : X86WriteResPairUnsupported<WritePMULLDZ>;
 711 defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
 712 defm : X86WriteResPairUnsupported<WriteMPSADY>;
 713 defm : X86WriteResPairUnsupported<WriteMPSADZ>;
 714 defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
 715 defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
 716 defm : X86WriteResPairUnsupported<WritePSADBWY>;
 717 defm : X86WriteResPairUnsupported<WritePSADBWZ>;
 718 defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
 719 defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
 720 defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
 721 defm : X86WriteResPairUnsupported<WriteShuffleY>;
 722 defm : X86WriteResPairUnsupported<WriteShuffleZ>;
 723 defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
 724 defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
 725 defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
 726 defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
 727 defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
 728 defm : X86WriteResPairUnsupported<WriteBlendY>;
 729 defm : X86WriteResPairUnsupported<WriteBlendZ>;
 730 defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
 731 defm : X86WriteResPairUnsupported<WriteVarBlendY>;
 732 defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
 733 defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
 734 defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
 735 defm : X86WriteResPairUnsupported<WriteVecLogicY>;
 736 defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
 737 defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>; // Same parameters as WriteFTest in the FP section.
 738 defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>; // Same parameters as WriteFTestY; the only supported Y write in this group.
 739 defm : X86WriteResPairUnsupported<WriteVecTestZ>;
 740 defm : X86WriteResPairUnsupported<WriteShuffle256>;
 741 defm : X86WriteResPairUnsupported<WriteVPMOV256>;
 742 defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
 743
 744 ////////////////////////////////////////////////////////////////////////////////
 745 // Vector insert/extract operations.
 746 ////////////////////////////////////////////////////////////////////////////////
 747
 748 defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
 749 defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>; // Load form adds JLAGU to the register form's resources.
 750 defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
 751 defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>; // Store form uses the same three resources as the vector store writes.
 752
 753 ////////////////////////////////////////////////////////////////////////////////
 754 // SSE4.2 string instructions.
 755 ////////////////////////////////////////////////////////////////////////////////
 756
 757 defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>; // IStrI and IStrM differ only in latency (7 vs 8).
 758 defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
 759 defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>; // EStrI and EStrM share identical parameters.
 760 defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
 761
 762 ////////////////////////////////////////////////////////////////////////////////
 763 // MOVMSK Instructions.
 764 ////////////////////////////////////////////////////////////////////////////////
 765
 766 def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; } // The three supported MOVMSK writes share one resource list and latency.
 767 def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
 768 defm : X86WriteResUnsupported<WriteVecMOVMSKY>; // The Y form is marked unsupported.
 769 def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
 770
 771 ////////////////////////////////////////////////////////////////////////////////
 772 // AES Instructions.
 773 ////////////////////////////////////////////////////////////////////////////////
 774
 775 defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>; // AESIMC and AESKeyGen share the resources and latency used for WriteVecIMul.
 776 defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
 777 defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>; // Additionally lists JFPU01 and JVALU.
 778
 779 ////////////////////////////////////////////////////////////////////////////////
 780 // Horizontal add/sub instructions.
 781 ////////////////////////////////////////////////////////////////////////////////
 782
 783 defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
 784 defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
 785 defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
 786 defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
 787 defm : X86WriteResPairUnsupported<WritePHAddY>; // Integer Y form is unsupported, while the FP Y form above is modeled.
 788
 789 ////////////////////////////////////////////////////////////////////////////////
 790 // Carry-less multiplication instructions.
 791 ////////////////////////////////////////////////////////////////////////////////
 792
 793 defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>; // Same resources and latency as WriteVecIMul.
 794
 795 ////////////////////////////////////////////////////////////////////////////////
 796 // SSE4A instructions.
 797 ////////////////////////////////////////////////////////////////////////////////
 798
 799 def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> { // Custom write for the two INSERTQ forms bound below.
 800   let Latency = 2;
 801   let ReleaseAtCycles = [1, 4]; // Entries pair positionally with the resource list: JFPU01 -> 1, JVALU -> 4.
 802 }
 803 def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
 804
 805 ////////////////////////////////////////////////////////////////////////////////
 806 // AVX instructions.
 807 ////////////////////////////////////////////////////////////////////////////////
 808
 809 def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>; // No field overrides: latency, cycles and micro-ops stay at the SchedWriteRes defaults.
 810 def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rri)>; // Applies to the register-register-immediate form only.
 811
 812 def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> { // Custom write for the YMM broadcast-from-memory forms bound below.
 813   let Latency = 6;
 814   let ReleaseAtCycles = [1, 2, 4]; // Positional with the resource list: JLAGU -> 1, JFPU01 -> 2, JFPX -> 4.
 815   let NumMicroOps = 2;
 816 }
 817 def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
 818                                             VBROADCASTSSYrm,
 819                                             VBROADCASTF128rm)>;
 820
 821 def JWriteJVZEROALL: SchedWriteRes<[]> { // No execution resources listed; the cost is expressed only via latency and micro-op count.
 822   let Latency = 90;
 823   let NumMicroOps = 73;
 824 }
 825 def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
 826
 827 def JWriteJVZEROUPPER: SchedWriteRes<[]> { // No execution resources listed; the cost is expressed only via latency and micro-op count.
 828   let Latency = 46;
 829   let NumMicroOps = 37;
 830 }
 831 def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
 832
 833 ///////////////////////////////////////////////////////////////////////////////
 834 //  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
 835 ///////////////////////////////////////////////////////////////////////////////
 836
 837 def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> { // Shared by the SSE and AVX, 32- and 64-bit MASKMOVDQU forms bound below.
 838   let Latency = 34;
 839   let ReleaseAtCycles = [1, 1, 2, 2, 2, 16, 42]; // Seven entries, positional with the seven resources; JSAGU (16) and JALU01 (42) dominate.
 840   let NumMicroOps = 63;
 841 }
 842 def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
 843                                          VMASKMOVDQU, VMASKMOVDQU64)>;
 844
 845 ///////////////////////////////////////////////////////////////////////////////
 846 //  SchedWriteVariant definitions.
 847 ///////////////////////////////////////////////////////////////////////////////
 848
 849 def JWriteZeroLatency : SchedWriteRes<[]> { // Uses no resources; selected by the zero-idiom variants in this section.
 850   let Latency = 0;
 851 }
 852
 853 def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> { // Unlike JWriteZeroLatency, this still occupies JFPU01/JFPX; Latency is not overridden.
 854   let NumMicroOps = 2;
 855 }
 856
 857 // Certain instructions that use the same register for both source
 858 // operands do not have a real dependency on the previous contents of the
 859 // register, and thus, do not have to wait before completing. They can be
 860 // optimized out at the register renaming stage.
 861 // Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
 862 // 15h Processors".
 863 // Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
 864 // Section 21.8 [Dependency-breaking instructions].
 865
 866 def JWriteZeroIdiom : SchedWriteVariant<[ // GPR SUB/XOR: pick the write based on ZeroIdiomPredicate.
 867     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Zero idiom: no resources, zero latency.
 868     SchedVar<NoSchedPred,                          [WriteALU]> // Otherwise: the regular WriteALU cost.
 869 ]>;
 870 def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
 871                                         XOR32rr, XOR64rr)>;
 872
 873 def JWriteFZeroIdiom : SchedWriteVariant<[ // XMM FP XOR/ANDN: pick the write based on ZeroIdiomPredicate.
 874     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Zero idiom: no resources, zero latency.
 875     SchedVar<NoSchedPred,                          [WriteFLogic]> // Otherwise: the regular WriteFLogic cost.
 876 ]>;
 877 def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
 878                                          ANDNPSrr, VANDNPSrr,
 879                                          ANDNPDrr, VANDNPDrr)>;
 880
 881 def JWriteFZeroIdiomY : SchedWriteVariant<[ // YMM FP XOR/ANDN: pick the write based on ZeroIdiomPredicate.
 882     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>, // Zero idiom: JWriteZeroIdiomYmm, which still uses JFPU01/JFPX and 2 micro-ops.
 883     SchedVar<NoSchedPred,                          [WriteFLogicY]> // Otherwise: the regular WriteFLogicY cost.
 884 ]>;
 885 def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
 886                                           VANDNPSYrr, VANDNPDYrr)>;
 887
 888 def JWriteVZeroIdiomLogic : SchedWriteVariant<[ // MMX PXOR/PANDN: pick the write based on ZeroIdiomPredicate.
 889     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Zero idiom: no resources, zero latency.
 890     SchedVar<NoSchedPred,                          [WriteVecLogic]> // Otherwise: the regular WriteVecLogic cost.
 891 ]>;
 892 def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;
 893
 894 def JWriteVZeroIdiomLogicX : SchedWriteVariant<[ // XMM PXOR/PANDN (SSE and AVX): pick the write based on ZeroIdiomPredicate.
 895     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Zero idiom: no resources, zero latency.
 896     SchedVar<NoSchedPred,                          [WriteVecLogicX]> // Otherwise: the regular WriteVecLogicX cost.
 897 ]>;
 898 def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
 899                                                PANDNrr, VPANDNrr)>;
 900
 901 def JWriteVZeroIdiomALU : SchedWriteVariant<[ // MMX PSUB*/PCMPGT*: pick the write based on ZeroIdiomPredicate.
 902     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Zero idiom: no resources, zero latency.
 903     SchedVar<NoSchedPred,                          [WriteVecALU]> // Otherwise: the regular WriteVecALU cost.
 904 ]>;
 905 def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr,
 906                                             MMX_PSUBQrr, MMX_PSUBWrr,
 907                                             MMX_PSUBSBrr, MMX_PSUBSWrr,
 908                                             MMX_PSUBUSBrr, MMX_PSUBUSWrr,
 909                                             MMX_PCMPGTBrr, MMX_PCMPGTDrr,
 910                                             MMX_PCMPGTWrr)>;
 911
 912 def JWriteVZeroIdiomALUX : SchedWriteVariant<[ // XMM PSUB*/PCMPGT* (SSE and AVX): pick the write based on ZeroIdiomPredicate.
 913     SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Zero idiom: no resources, zero latency.
 914     SchedVar<NoSchedPred,                          [WriteVecALUX]> // Otherwise: the regular WriteVecALUX cost.
 915 ]>;
 916 def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
 917                                              PSUBDrr, VPSUBDrr,
 918                                              PSUBQrr, VPSUBQrr,
 919                                              PSUBWrr, VPSUBWrr,
 920                                              PSUBSBrr, VPSUBSBrr,
 921                                              PSUBSWrr, VPSUBSWrr,
 922                                              PSUBUSBrr, VPSUBUSBrr,
 923                                              PSUBUSWrr, VPSUBUSWrr,
 924                                              PCMPGTBrr, VPCMPGTBrr,
 925                                              PCMPGTDrr, VPCMPGTDrr,
 926                                              PCMPGTQrr, VPCMPGTQrr,
 927                                              PCMPGTWrr, VPCMPGTWrr)>;
 928
 929 def JWriteVPERM2F128 : SchedWriteVariant<[ // Uses its own predicate (ZeroIdiomVPERMPredicate) rather than ZeroIdiomPredicate.
 930   SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>, // Zero idiom: still uses JFPU01/JFPX and 2 micro-ops.
 931   SchedVar<NoSchedPred,                               [WriteFShuffle256]> // Otherwise: the regular WriteFShuffle256 cost.
 932 ]>;
 933 def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rri)>;
 934
 935 // This write is used for slow LEA instructions.
 936 def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> { // Chosen by JWriteLEA when JSlowLEAPredicate matches.
 937   let Latency = 2;
 938 }
 939
 940 // On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
 941 // with a `Scale` value different than 1.
 942 def JSlowLEAPredicate : MCSchedPredicate<
 943   CheckAny<[ // True if either alternative below holds.
 944     // A 3-operand LEA (base, index, offset).
 945     IsThreeOperandsLEAFn,
 946     // An LEA with a "Scale" different than 1.
 947     CheckAll<[
 948       CheckIsImmOperand<2>, // Operand 2 must be an immediate...
 949       CheckNot<CheckImmOperand<2, 1>> // ...and that immediate must not equal 1.
 950     ]>
 951   ]>
 952 >;
 953
 954 def JWriteLEA : SchedWriteVariant<[ // Splits 32/64-bit LEA into a slow and a regular cost.
 955     SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>, // Slow LEA: JALU1 + JSAGU, latency 2.
 956     SchedVar<NoSchedPred,       [WriteLEA]> // Otherwise: the regular WriteLEA cost.
 957 ]>;
 958
 959 def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>; // LEA16r is handled separately by JSlowLEA16r.
 960
 961 def JSlowLEA16r : SchedWriteRes<[JALU01]> { // Fixed cost for LEA16r; no predicate-based variant is used.
 962   let Latency = 3;
 963   let ReleaseAtCycles = [4]; // NOTE(review): JALU01 is held for 4 cycles, longer than the 3-cycle latency -- confirm intended.
 964 }
 965
 966 def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
 967
 968 ///////////////////////////////////////////////////////////////////////////////
 969 // Dependency breaking instructions.
 970 ///////////////////////////////////////////////////////////////////////////////
 971
 972 def : IsZeroIdiomFunction<[ // Each class pairs an opcode list with the predicate under which those opcodes count as zero idioms.
 973   // GPR Zero-idioms.
 974   DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
 975
 976   // MMX Zero-idioms.
 977   DepBreakingClass<[
 978     MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
 979     MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
 980     MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
 981     MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
 982   ], ZeroIdiomPredicate>,
 983
 984   // SSE Zero-idioms.
 985   DepBreakingClass<[
 986     // fp variants.
 987     XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
 988
 989     // int variants.
 990     PXORrr, PANDNrr,
 991     PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
 992     PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
 993     PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
 994   ], ZeroIdiomPredicate>,
 995
 996   // AVX Zero-idioms.
 997   DepBreakingClass<[
 998     // xmm fp variants.
 999     VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
1000
1001     // xmm int variants.
1002     VPXORrr, VPANDNrr,
1003     VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1004     VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
1005     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1006
1007     // ymm variants.
1008     VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1009   ], ZeroIdiomPredicate>,
1010
1011   DepBreakingClass<[ VPERM2F128rri ], ZeroIdiomVPERMPredicate> // Same opcode and predicate as the JWriteVPERM2F128 variant.
1012 ]>;
1013
1014 def : IsDepBreakingFunction<[ // Each class pairs an opcode list with the predicate under which those opcodes count as dependency-breaking.
1015   // GPR
1016   DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
1017   DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >, // CMP compares operands 0 and 1 directly instead of using ZeroIdiomPredicate.
1018
1019   // MMX
1020   DepBreakingClass<[
1021     MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
1022   ], ZeroIdiomPredicate>,
1023
1024   // SSE
1025   DepBreakingClass<[
1026     PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1027   ], ZeroIdiomPredicate>,
1028
1029   // AVX
1030   DepBreakingClass<[
1031     VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1032   ], ZeroIdiomPredicate>
1033 ]>;
1034
1035 def : IsOptimizableRegisterMove<[ // Lists the register-to-register move opcodes this model treats as optimizable.
1036   InstructionEquivalenceClass<[
1037     // GPR variants.
1038     MOV32rr, MOV64rr,
1039
1040     // MMX variants.
1041     MMX_MOVQ64rr,
1042
1043     // SSE variants.
1044     MOVAPSrr, MOVUPSrr,
1045     MOVAPDrr, MOVUPDrr,
1046     MOVDQArr, MOVDQUrr,
1047
1048     // AVX variants.
1049     VMOVAPSrr, VMOVUPSrr,
1050     VMOVAPDrr, VMOVUPDrr,
1051     VMOVDQArr, VMOVDQUrr
1052   ], TruePred > // NOTE(review): TruePred presumably means no extra condition on the listed moves -- confirm. No YMM moves are listed.
1053 ]>;
1054
1055 } // SchedModel