1 //=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines the machine model for Znver3 to support instruction
10 // scheduling and other instruction cost heuristics.
12 // * AMD Software Optimization Guide for AMD Family 19h Processors.
13 // https://www.amd.com/system/files/TechDocs/56665.zip
14 // * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
15 // http://www.agner.org/optimize/microarchitecture.pdf
16 // * AMD Zen 3 Ryzen Deep Dive Review
17 // https://www.anandtech.com/show/16214/
18 //===----------------------------------------------------------------------===//
20 def Znver3Model : SchedMachineModel {
21 // AMD SOG 19h, 2.9.6 Dispatch
22 // The processor may dispatch up to 6 macro ops per cycle
23 // into the execution engine.
25 // AMD SOG 19h, 2.10.3
26 // The retire control unit (RCU) tracks the completion status of all
27 // outstanding operations (integer, load/store, and floating-point) and is
28 // the final arbiter for exception processing and recovery.
29 // The unit can receive up to 6 macro ops dispatched per cycle and track up
30 // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
31 let MicroOpBufferSize = 256;
32 // AMD SOG 19h, 2.9.1 Op Cache
33 // The op cache is organized as an associative cache with 64 sets and 8 ways.
34 // At each set-way intersection is an entry containing up to 8 macro ops.
35 // The maximum capacity of the op cache is 4K ops.
36 // Agner, 22.5 µop cache
37 // The size of the µop cache is big enough for holding most critical loops.
38 // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
39 // with large values here the compilation of certain loops
40 // ends up taking way too long.
41 // let LoopMicroOpBufferSize = 4096;
42 let LoopMicroOpBufferSize = 512;
43 // AMD SOG 19h, 2.6.2 L1 Data Cache
44 // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
45 // AMD SOG 19h, 2.12 L1 Data Cache
46 // The AGU and LS pipelines are optimized for simple address generation modes.
47 // <...> and can achieve 4-cycle load-to-use integer load latency.
49 // AMD SOG 19h, 2.12 L1 Data Cache
50 // The AGU and LS pipelines are optimized for simple address generation modes.
51 // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
52 int VecLoadLatency = 7;
53 // Latency of a simple store operation.
56 let HighLatency = 25; // FIXME: any better choice?
57 // AMD SOG 19h, 2.8 Optimizing Branching
58 // The branch misprediction penalty is in the range from 11 to 18 cycles,
59 // <...>. The common case penalty is 13 cycles.
60 let MispredictPenalty = 13;
62 let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
64 let CompleteModel = 1;
67 let SchedModel = Znver3Model in {
70 //===----------------------------------------------------------------------===//
72 //===----------------------------------------------------------------------===//
74 // AMD SOG 19h, 2.10.3 Retire Control Unit
75 // The unit can receive up to 6 macro ops dispatched per cycle and track up to
76 // 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
77 // The retire unit handles in-order commit of up to eight macro ops per cycle.
// Retire control unit for this model. The first argument reuses
// Znver3Model.MicroOpBufferSize (set to 256 in the model definition); the
// second (8) matches the "up to eight macro ops per cycle" commit rate quoted
// above.
// NOTE(review): argument meaning (buffer size, retire-per-cycle) follows the
// RetireControlUnit class in TargetSchedule.td -- confirm there.
78 def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;
80 //===----------------------------------------------------------------------===//
82 //===----------------------------------------------------------------------===//
84 // There are a total of three Units, each one with its own schedulers.
86 //===----------------------------------------------------------------------===//
87 // Integer Execution Unit
90 // AMD SOG 19h, 2.4 Superscalar Organization
91 // The processor uses four decoupled independent integer scheduler queues,
92 // each one servicing one ALU pipeline and one or two other pipelines
96 //===----------------------------------------------------------------------===//
98 // AMD SOG 19h, 2.10.2 Execution Units
99 // The processor contains 4 general purpose integer execution pipes.
100 // Each pipe has an ALU capable of general purpose integer operations.
101 def Zn3ALU0 : ProcResource<1>;
102 def Zn3ALU1 : ProcResource<1>;
103 def Zn3ALU2 : ProcResource<1>;
104 def Zn3ALU3 : ProcResource<1>;
106 // AMD SOG 19h, 2.10.2 Execution Units
107 // There is also a separate branch execution unit.
108 def Zn3BRU1 : ProcResource<1>;
110 // AMD SOG 19h, 2.10.2 Execution Units
111 // There are three Address Generation Units (AGUs) for all load and store
112 // address generation. There are also 3 store data movement units
113 // associated with the same schedulers as the AGUs.
114 def Zn3AGU0 : ProcResource<1>;
115 def Zn3AGU1 : ProcResource<1>;
116 def Zn3AGU2 : ProcResource<1>;
120 //===----------------------------------------------------------------------===//
// Capability aliases. Each defvar below only gives a second name to an ALU
// pipe that is already defined; no additional ProcResource is created.
// Zn3Divider and Zn3BRU0 both name Zn3ALU0, so work placed on either of them
// competes with ordinary ALU0 work for the same pipe.
122 // AMD SOG 19h, 2.10.2 Execution Units
123 // ALU0 additionally has divide <...> execution capability.
124 defvar Zn3Divider = Zn3ALU0;
126 // AMD SOG 19h, 2.10.2 Execution Units
127 // ALU0 additionally has <...> branch execution capability.
128 defvar Zn3BRU0 = Zn3ALU0;
130 // Integer Multiplication issued on ALU1.
// The multiplier is likewise just another name for the Zn3ALU1 pipe.
131 defvar Zn3Multiplier = Zn3ALU1;
133 // Execution pipeline grouping
134 //===----------------------------------------------------------------------===//
// Integer pipeline groups. Each ProcResGroup lists the pipes that can execute
// one category of operation; the numeric suffix spells out the member pipes.
136 // General ALU operations
137 def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;
139 // General AGU operations
140 def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;
142 // Control flow: jumps, calls
// Zn3BRU0 is a defvar alias of Zn3ALU0, so the members of this group are the
// ALU0 pipe and the separate Zn3BRU1 branch unit.
143 def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;
145 // Everything that isn't control flow, but still needs to access CC register,
146 // namely: conditional moves, SETcc.
147 def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;
// The next comment describes the single pipe Zn3ALU1; no group is defined for
// it here, users reference Zn3ALU1 directly.
149 // Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
151 // Simple bit twiddling: bit test, shift/rotate, bit extraction
152 def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;
157 //===----------------------------------------------------------------------===//
159 // AMD SOG 19h, 2.10.3 Retire Control Unit
160 // The integer physical register file (PRF) consists of 192 registers.
// Arguments, in order: number of physical registers (192), the register
// classes renamed through this file (GR64, CCR), one cost entry per class
// ([1, 1]), and one move-elimination flag per class ([1, 0]: set for GR64,
// clear for CCR).
// NOTE(review): parameter meanings follow the RegisterFile class in
// TargetSchedule.td -- confirm against that definition.
161 def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
162 6, // Max moves that can be eliminated per cycle.
163 0>; // "Restrict move elimination to zero regs" flag; 0 here, i.e. presumably not restricted -- confirm.
165 // anandtech, The integer scheduler has a 4*24 entry macro op capacity.
166 // AMD SOG 19h, 2.10.1 Schedulers
167 // The schedulers can receive up to six macro ops per cycle, with a limit of
168 // two per scheduler. Each scheduler can issue one micro op per cycle into
169 // each of its associated pipelines
170 // FIXME: these are 4 separate schedulers, not a single big one.
171 def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
172 Zn3ALU1, Zn3AGU1, // scheduler 1
173 Zn3ALU2, Zn3AGU2, // scheduler 2
174 Zn3ALU3, Zn3BRU1 // scheduler 3
176 let BufferSize = !mul(4, 24);
180 //===----------------------------------------------------------------------===//
181 // Floating-Point Unit
184 // AMD SOG 19h, 2.4 Superscalar Organization
185 // The processor uses <...> two decoupled independent floating point schedulers
186 // each servicing two FP pipelines and one store or FP-to-integer pipeline.
190 //===----------------------------------------------------------------------===//
192 // AMD SOG 19h, 2.10.1 Schedulers
193 // <...>, and six FPU pipes.
194 // Agner, 22.10 Floating point execution pipes
195 // There are six floating point/vector execution pipes,
196 def Zn3FPP0 : ProcResource<1>;
197 def Zn3FPP1 : ProcResource<1>;
198 def Zn3FPP2 : ProcResource<1>;
199 def Zn3FPP3 : ProcResource<1>;
200 def Zn3FPP45 : ProcResource<2>;
204 //===----------------------------------------------------------------------===//
205 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// The defvars below map each FP/vector capability onto the Zn3FPP0..Zn3FPP3
// pipes. They are aliases only: several capabilities share one pipe, and the
// trailing digit of an alias is an index within its capability, not a pipe
// number (e.g. Zn3FPFAdd0 is Zn3FPP2).
207 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
208 defvar Zn3FPFMul0 = Zn3FPP0;
209 defvar Zn3FPFMul1 = Zn3FPP1;
// Floating-point add pipes (category inferred from the Zn3FPFAdd* names).
212 defvar Zn3FPFAdd0 = Zn3FPP2;
213 defvar Zn3FPFAdd1 = Zn3FPP3;
215 // All convert operations except pack/unpack
216 defvar Zn3FPFCvt0 = Zn3FPP2;
217 defvar Zn3FPFCvt1 = Zn3FPP3;
219 // All Divide and Square Root except Reciprocal Approximation
220 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
221 // FDIV unit can support 2 simultaneous operations in flight
222 // even though it occupies a single pipe.
223 // FIXME: BufferSize=2 ?
224 defvar Zn3FPFDiv = Zn3FPP1;
226 // Moves and Logical operations on Floating Point Data Types
227 defvar Zn3FPFMisc0 = Zn3FPP0;
228 defvar Zn3FPFMisc1 = Zn3FPP1;
229 defvar Zn3FPFMisc2 = Zn3FPP2;
230 defvar Zn3FPFMisc3 = Zn3FPP3;
232 // Integer Adds, Subtracts, and Compares
233 // Some complex VADD operations are not available in all pipes.
234 defvar Zn3FPVAdd0 = Zn3FPP0;
235 defvar Zn3FPVAdd1 = Zn3FPP1;
236 defvar Zn3FPVAdd2 = Zn3FPP2;
237 defvar Zn3FPVAdd3 = Zn3FPP3;
239 // Integer Multiplies, SAD, Blendvb
// Note the second multiply alias is pipe 3, not pipe 1.
240 defvar Zn3FPVMul0 = Zn3FPP0;
241 defvar Zn3FPVMul1 = Zn3FPP3;
243 // Data Shuffles, Packs, Unpacks, Permute
244 // Some complex shuffle operations are only available in pipe1.
245 defvar Zn3FPVShuf = Zn3FPP1;
246 defvar Zn3FPVShufAux = Zn3FPP2;
248 // Bit Shift Left/Right operations
249 defvar Zn3FPVShift0 = Zn3FPP1;
250 defvar Zn3FPVShift1 = Zn3FPP2;
252 // Moves and Logical operations on Packed Integer Data Types
253 defvar Zn3FPVMisc0 = Zn3FPP0;
254 defvar Zn3FPVMisc1 = Zn3FPP1;
255 defvar Zn3FPVMisc2 = Zn3FPP2;
256 defvar Zn3FPVMisc3 = Zn3FPP3;
// AES pipes (category inferred from the Zn3FPAES* names).
259 defvar Zn3FPAES0 = Zn3FPP0;
260 defvar Zn3FPAES1 = Zn3FPP1;
// NOTE(review): presumably the carry-less multiply (PCLMULQDQ) pipes, going by
// the Zn3FPCLM* names -- confirm.
263 defvar Zn3FPCLM0 = Zn3FPP0;
264 defvar Zn3FPCLM1 = Zn3FPP1;
266 // Execution pipeline grouping
267 //===----------------------------------------------------------------------===//
269 // AMD SOG 19h, 2.11 Floating-Point Unit
270 // Stores and floating point to general purpose register transfer
271 // have 2 dedicated pipelines (pipe 5 and 6).
// Zn3FPU0123 therefore groups only the four execution pipes Zn3FPP0..Zn3FPP3;
// the two store/transfer pipes are modeled separately as Zn3FPP45.
272 def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
274 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
275 def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
// Floating-point add group. The "01" suffix counts aliases, not pipes:
// Zn3FPFAdd0/Zn3FPFAdd1 are aliases of Zn3FPP2/Zn3FPP3.
278 // Some complex VADD operations are not available in all pipes.
279 def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;
281 // All convert operations except pack/unpack
282 def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;
284 // All Divide and Square Root except Reciprocal Approximation
// Left commented out; presumably a one-member group is unnecessary because the
// Zn3FPFDiv alias already names the single pipe -- confirm.
285 // def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;
287 // Moves and Logical operations on Floating Point Data Types
288 def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3]>;
// Subset of the misc pipes restricted to aliases 1 and 2.
290 def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
292 // Loads, Stores and Move to General Register (EX) Operations
293 // AMD SOG 19h, 2.11 Floating-Point Unit
294 // Stores and floating point to general purpose register transfer
295 // have 2 dedicated pipelines (pipe 5 and 6).
// Alias for the two-unit Zn3FPP45 resource; no new resource is defined.
296 defvar Zn3FPLd01 = Zn3FPP45;
298 // AMD SOG 19h, 2.11 Floating-Point Unit
299 // Note that FP stores are supported on two pipelines,
300 // but throughput is limited to one per cycle.
// Modeled as a one-unit resource whose Super is the two-unit Zn3FPP45.
// NOTE(review): Super semantics (sub-resource of a larger pool) are defined by
// ProcResourceUnits in TargetSchedule.td -- confirm there.
301 let Super = Zn3FPP45 in
302 def Zn3FPSt : ProcResource<1>;
304 // Integer Adds, Subtracts, and Compares
305 // Some complex VADD operations are not available in all pipes.
306 def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3]>;
// Narrower integer-add groups for operations limited to two of the pipes.
308 def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
309 def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;
311 // Integer Multiplies, SAD, Blendvb
// Members are aliases of Zn3FPP0 and Zn3FPP3 (not pipes 0 and 1).
312 def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;
314 // Data Shuffles, Packs, Unpacks, Permute
315 // Some complex shuffle operations are only available in pipe1.
// Members are aliases of Zn3FPP1 (Zn3FPVShuf) and Zn3FPP2 (Zn3FPVShufAux).
316 def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;
318 // Bit Shift Left/Right operations
// Members are aliases of Zn3FPP1 and Zn3FPP2.
319 def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;
321 // Moves and Logical operations on Packed Integer Data Types
322 def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3]>;
// AES group (category inferred from the names); aliases of Zn3FPP0/Zn3FPP1.
325 def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;
// NOTE(review): presumably the carry-less multiply group, going by the name;
// its members are aliases of Zn3FPP0/Zn3FPP1.
328 def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;
333 //===----------------------------------------------------------------------===//
335 // Agner, 21.8 Register renaming and out-of-order schedulers
336 // The floating point register file has 160 vector registers
337 // of 128 bits each in Zen 1 and 256 bits each in Zen 2.
338 // anandtech also confirms this.
// The figures cited above are for Zen 1 / Zen 2; this Znver3 model uses the
// same count of 160 registers.
// Arguments, in order: number of physical registers (160), the register
// classes renamed through this file (VR64, VR128, VR256), one cost entry per
// class ([1, 1, 1]), and one move-elimination flag per class ([0, 1, 1]: clear
// for VR64, set for VR128 and VR256).
// NOTE(review): parameter meanings follow the RegisterFile class in
// TargetSchedule.td -- confirm against that definition.
339 def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
340 6, // Max moves that can be eliminated per cycle.
341 0>; // "Restrict move elimination to zero regs" flag; 0 here, i.e. presumably not restricted -- confirm.
343 // AMD SOG 19h, 2.11 Floating-Point Unit
344 // The floating-point scheduler has a 2*32 entry macro op capacity.
345 // AMD SOG 19h, 2.11 Floating-Point Unit
346 // <...> the scheduler can issue 1 micro op per cycle for each pipe.
347 // FIXME: those are two separate schedulers, not a single big one.
348 def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/ // scheduler 0
349 Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/ // scheduler 1
351 let BufferSize = !mul(2, 32);
354 // AMD SOG 19h, 2.11 Floating-Point Unit
355 // Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
356 // even if floating-point scheduler is full.
357 // FIXME: how to model this properly?
360 //===----------------------------------------------------------------------===//
364 // AMD SOG 19h, 2.12 Load-Store Unit
365 // The LS unit contains three largely independent pipe-lines
366 // enabling the execution of three 256-bit memory operations per cycle.
367 def Zn3LSU : ProcResource<3>;
369 // AMD SOG 19h, 2.12 Load-Store Unit
370 // All three memory operations can be loads.
371 let Super = Zn3LSU in
372 def Zn3Load : ProcResource<3> {
373 // AMD SOG 19h, 2.12 Load-Store Unit
374 // The LS unit can process up to 72 out-of-order loads.
378 def Zn3LoadQueue : LoadQueue<Zn3Load>;
380 // AMD SOG 19h, 2.12 Load-Store Unit
381 // A maximum of two of the memory operations can be stores.
382 let Super = Zn3LSU in
383 def Zn3Store : ProcResource<2> {
384 // AMD SOG 19h, 2.12 Load-Store Unit
385 // The LS unit utilizes a 64-entry store queue (STQ).
389 def Zn3StoreQueue : StoreQueue<Zn3Store>;
391 //===----------------------------------------------------------------------===//
392 // Basic helper classes.
393 //===----------------------------------------------------------------------===//
395 // Many SchedWrites are defined in pairs with and without a folded load.
396 // Instructions with folded loads are usually micro-fused, so they only appear
397 // as two micro-ops when dispatched by the schedulers.
398 // This multiclass defines the resource usage for variants with and without
401 multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
402 int Lat = 1, list<int> Res = [], int UOps = 1> {
403 def : WriteRes<SchedRW, ExePorts> {
405 let ResourceCycles = Res;
406 let NumMicroOps = UOps;
410 multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
411 list<ProcResourceKind> ExePorts, int Lat,
412 list<int> Res, int UOps, int LoadLat, int LoadUOps,
413 ProcResourceKind AGU, int LoadRes> {
414 defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
416 defm : __zn3WriteRes<SchedRW.Folded,
417 !listconcat([AGU, Zn3Load], ExePorts),
419 !if(!and(!empty(Res), !eq(LoadRes, 1)),
421 !listconcat([1, LoadRes],
423 !listsplat(1, !size(ExePorts)),
425 !add(UOps, LoadUOps)>;
428 // For classes without folded loads.
429 multiclass Zn3WriteResInt<SchedWrite SchedRW,
430 list<ProcResourceKind> ExePorts, int Lat = 1,
431 list<int> Res = [], int UOps = 1> {
432 defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
435 multiclass Zn3WriteResXMM<SchedWrite SchedRW,
436 list<ProcResourceKind> ExePorts, int Lat = 1,
437 list<int> Res = [], int UOps = 1> {
438 defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
441 multiclass Zn3WriteResYMM<SchedWrite SchedRW,
442 list<ProcResourceKind> ExePorts, int Lat = 1,
443 list<int> Res = [], int UOps = 1> {
444 defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
447 // For classes with folded loads.
448 multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
449 list<ProcResourceKind> ExePorts, int Lat = 1,
450 list<int> Res = [], int UOps = 1,
451 int LoadUOps = 0, int LoadRes = 1> {
452 defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
453 Znver3Model.LoadLatency,
454 LoadUOps, Zn3AGU012, LoadRes>;
457 multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
458 list<ProcResourceKind> ExePorts, int Lat = 1,
459 list<int> Res = [], int UOps = 1,
460 int LoadUOps = 0, int LoadRes = 1> {
461 defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
462 Znver3Model.VecLoadLatency,
463 LoadUOps, Zn3FPLd01, LoadRes>;
466 multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
467 list<ProcResourceKind> ExePorts, int Lat = 1,
468 list<int> Res = [], int UOps = 1,
469 int LoadUOps = 0, int LoadRes = 1> {
470 defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
471 Znver3Model.VecLoadLatency,
472 LoadUOps, Zn3FPLd01, LoadRes>;
476 //===----------------------------------------------------------------------===//
478 //===----------------------------------------------------------------------===//
// Read advances for the register operands of load-folded instructions.
// The general-purpose variant is advanced by the model's integer load latency;
// the three vector variants all use the model's VecLoadLatency.
// NOTE(review): a positive ReadAdvance is understood to let the operand be
// read that many cycles after issue (so the register input need not be ready
// until the folded load completes) -- confirm against TargetSchedule.td.
480 def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;
482 def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
483 def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
484 def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;
486 // AMD SOG 19h, 2.11 Floating-Point Unit
487 // There is 1 cycle of added latency for a result to cross
488 // from F to I or I to F domain.
// The advance is negative (-1), which is how the one extra cycle of
// domain-crossing latency is expressed for ReadInt2Fpu operands.
489 def : ReadAdvance<ReadInt2Fpu, -1>;
491 // Instructions with both a load and a store folded are modeled as a folded
493 defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 1], 0>;
495 // Loads, stores, and moves, not folded with other operations.
496 defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;
498 // Model the effect of clobbering the read-write mask operand of the GATHER operation.
499 // Does not cost anything by itself, only has latency, matching that of the WriteLoad,
500 defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver3Model.LoadLatency, 1), [], 0>;
502 def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
503 let Latency = !add(Znver3Model.LoadLatency, 1);
504 let ResourceCycles = [3, 1];
507 def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
509 defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
510 defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
511 defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;
513 // Treat misc copies as a move.
514 def : InstRW<[WriteMove], (instrs COPY)>;
516 def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
517 let Latency = Znver3Model.LoadLatency;
518 let ResourceCycles = [1, 1, 4];
521 def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;
523 def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
524 let Latency = Znver3Model.StoreLatency;
525 let ResourceCycles = [4, 1, 1];
528 def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
531 defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.
533 def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
535 let ResourceCycles = [4];
538 def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
539 AND8i8, AND16i16, AND32i32, AND64i32,
540 OR8i8, OR16i16, OR32i32, OR64i32,
541 SUB8i8, SUB16i16, SUB32i32, SUB64i32,
542 XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
544 def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
546 let ResourceCycles = [4];
549 def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
551 def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
553 let ResourceCycles = [2];
556 def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
558 def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
560 let ResourceCycles = [1];
563 def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
564 PEXT32rr, PEXT64rr)>;
566 defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.
568 def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
570 let ResourceCycles = [1, 1, 7, 1];
573 def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
575 // This is for simple LEAs with one or two input operands.
576 defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>; // LEA instructions can't fold loads.
578 // This write is used for slow LEA instructions.
579 def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
581 let ResourceCycles = [1];
585 // On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
586 // or an LEA with a `Scale` value different than 1.
587 def Zn3SlowLEAPredicate : MCSchedPredicate<
589 // A 3-operand LEA (base, index, offset).
590 IsThreeOperandsLEAFn,
591 // An LEA with a "Scale" different than 1.
593 CheckIsImmOperand<2>,
594 CheckNot<CheckImmOperand<2, 1>>
599 def Zn3WriteLEA : SchedWriteVariant<[
600 SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
601 SchedVar<NoSchedPred, [WriteLEA]>
604 def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
606 def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
607 let Latency = 2; // FIXME: not from llvm-exegesis
608 let ResourceCycles = [4];
612 def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;
614 // Integer multiplication
615 defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
616 defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
617 defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
618 defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
619 defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
620 defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 4, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
622 def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
624 let ResourceCycles = [1];
627 def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;
629 def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
630 let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
631 let ResourceCycles = [1, 1, 2];
632 let NumMicroOps = Zn3MULX32rr.NumMicroOps;
634 def : InstRW<[Zn3MULX32rm, WriteIMulH,
635 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
636 ReadAfterLd], (instrs MULX32rm)>;
638 defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
639 defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
640 defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
641 defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 4, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
643 def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
645 let ResourceCycles = [1];
648 def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;
650 def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
651 let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
652 let ResourceCycles = [1, 1, 2];
653 let NumMicroOps = Zn3MULX64rr.NumMicroOps;
655 def : InstRW<[Zn3MULX64rm, WriteIMulH,
656 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
657 ReadAfterLd], (instrs MULX64rm)>;
659 defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
660 defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
661 defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
663 defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
664 defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
666 defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
668 def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
670 let ResourceCycles = [12];
673 def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
675 defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>; // Compare and set, compare and swap.
677 def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
678 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
679 let ResourceCycles = [1, 1, 12];
680 let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
682 def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
684 def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
685 let Latency = 3; // FIXME: not from llvm-exegesis
686 let ResourceCycles = [24];
687 let NumMicroOps = 19;
689 def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
691 def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
692 let Latency = 4; // FIXME: not from llvm-exegesis
693 let ResourceCycles = [59];
694 let NumMicroOps = 28;
696 def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
698 def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
700 let ResourceCycles = [2];
703 def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
705 def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
706 let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
707 let ResourceCycles = [1, 1, 2];
710 def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
712 def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
713 let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
714 let ResourceCycles = [1, 1, 2];
717 def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
// Integer division, unsigned (WriteDiv*) and signed (WriteIDiv*), by operand
// width. After the port list the arguments are latency, resource cycles and
// micro-op count, in the order of the Zn3WriteResIntPair parameter list.
// Every entry holds Zn3Divider (an alias of Zn3ALU0) for as many cycles as its
// latency, and the signed and unsigned variants of a width use identical
// numbers.
720 // FIXME: uops for 8-bit division measures as 2. for others it's a guess.
721 // FIXME: latency for 8-bit division measures as 10. for others it's a guess.
722 defm : Zn3WriteResIntPair<WriteDiv8, [Zn3Divider], 10, [10], 2>;
723 defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider], 11, [11], 2>;
724 defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider], 13, [13], 2>;
725 defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider], 17, [17], 2>;
726 defm : Zn3WriteResIntPair<WriteIDiv8, [Zn3Divider], 10, [10], 2>;
727 defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
728 defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
729 defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;
731 defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
732 defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.
734 defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.
736 def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
738 let ResourceCycles = [4];
741 def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;
743 defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.
745 def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
747 let ResourceCycles = [4];
750 def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;
752 defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.
754 def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
756 let ResourceCycles = [4];
759 def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;
761 defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
762 defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
763 defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
764 defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
765 defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.
767 defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
768 defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
769 defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;
771 defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
772 defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
773 defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
775 // Integer shifts and rotates.
776 defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
777 defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
778 defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
780 def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
782 let ResourceCycles = [2];
785 def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
786 RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
788 def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
789 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
790 let ResourceCycles = [1, 1, 2];
791 let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
793 def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
794 RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
796 def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
798 let ResourceCycles = [6];
801 def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
803 def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
804 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
805 let ResourceCycles = [1, 1, 8];
806 let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
808 def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
810 def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
812 let ResourceCycles = [8];
815 def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
817 def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
818 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
819 let ResourceCycles = [1, 1, 8];
820 let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
822 def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
// Rotates by CL. WriteRotateCL is the generic class; the RCR-by-CL forms are
// bound by InstRW to the dedicated writes that follow.
824 defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
// RCR by CL, register form: ResourceCycles of 6 on Zn3ALU12.
826 def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
828 let ResourceCycles = [6];
831 def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
// RCR by CL, memory form: register latency plus LoadLatency, two more micro-ops.
// NOTE(review): the ALU entry is 8 cycles here versus 6 in the register form - confirm.
833 def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
834 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
835 let ResourceCycles = [1, 1, 8];
836 let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
838 def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
// RCL by CL, register form: ResourceCycles of 8 on Zn3ALU12.
840 def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
842 let ResourceCycles = [8];
845 def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
// RCL by CL, memory form: register latency plus LoadLatency, same ALU cycles
// as the register form, and two more micro-ops.
847 def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
848 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
849 let ResourceCycles = [1, 1, 8];
850 let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
852 def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
854 // Double shift instructions.
// Register forms (rri, rrcl) use only Zn3ALU12. Memory forms (mri, mrcl) add
// Zn3AGU012 and Zn3Load and express their latency as LoadLatency + 2.
855 defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
856 defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
857 defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
858 defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
860 // BMI1 BEXTR/BLS, BMI2 BZHI
// BEXTR and BZHI are restricted to Zn3ALU12; BLS may use any of Zn3ALU0123.
// Each pair passes LoadUOps=1 for its load-folded variant.
861 defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
862 defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
863 defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
865 // Idioms that clear a register, like xorps %xmm0, %xmm0.
866 // These can often bypass execution ports completely.
// Modelled with a 0 latency argument and a 0-cycle reservation on Zn3ALU0123.
867 defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
869 // Branches don't produce values, so they have no latency, but they still
870 // consume resources. Indirect branches can fold loads.
// Defined as a pair (register and load-folded variant) on Zn3BRU01.
871 defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
873 // Floating point. This covers both scalar and vector operations.
// WriteFLD0/FLD1/FLDC use Zn3FPLd01, Zn3Load and Zn3FPP1, with latency
// written as LoadLatency plus a per-class constant.
874 defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
875 defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
876 defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
// FP loads (plain and masked, XMM and YMM): Zn3FPLd01 + Zn3Load, latency
// VecLoadLatency + 1.
877 defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
878 defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
879 defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
880 defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
881 defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
// FP store: Zn3FPSt + Zn3Store, latency taken from the model's StoreLatency.
882 defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
// Store write bound by InstRW to the (V)MOVHPD/(V)MOVHPS memory-destination
// forms; uses Zn3FPSt + Zn3Store with a fixed latency of 2.
// NOTE(review): the record name says "MMX" (and repeats "Write"), but the
// instructions listed are MOVHPS/MOVHPD stores - confirm the naming.
884 def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
885 let Latency = 2; // FIXME: not from llvm-exegesis
886 let ResourceCycles = [1, 1];
889 def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
890 VMOVHPDmr, VMOVHPSmr)>;
// FP stores, regular and non-temporal (NT), XMM and YMM: Zn3FPSt + Zn3Store,
// one cycle each, latency from the model's StoreLatency.
892 defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
893 defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
894 defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
895 defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
896 defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
// FP masked stores: same resources, but with multi-cycle Zn3FPSt reservations
// and much larger final arguments (presumably micro-op counts - confirm
// against the Zn3WriteRes* multiclass definitions).
898 defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
899 defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
900 defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
901 defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
903 defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.
// x87 add/sub/subr/mul with a 16- or 32-bit integer memory operand (the
// *_FI16m / *_FI32m forms): load resources plus 24 cycles on Zn3FPU0123.
905 def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
906 let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
907 let ResourceCycles = [1, 1, 24];
910 def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
911 SUB_FI16m, SUB_FI32m,
912 SUBR_FI16m, SUBR_FI32m,
913 MUL_FI16m, MUL_FI32m)>;
// x87 div/divr with an integer memory operand: same shape as above but with
// 62 cycles on Zn3FPU0123.
915 def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
916 let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
917 let ResourceCycles = [1, 1, 62];
920 def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
921 DIVR_FI16m, DIVR_FI32m)>;
// FP add/sub, compare, compare-to-flags and multiply classes. Every ZMM (*Z)
// class in this group is declared unsupported.
923 defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
924 defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
925 defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
926 defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
927 defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
928 defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
929 defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
930 defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
931 defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
932 defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
933 defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
934 defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
935 defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
936 defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
937 defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
938 defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
939 defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
940 defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
941 defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
942 defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
943 defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
944 defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
945 defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
946 defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
947 defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
// FP division and square root: all of these are bound to the single Zn3FPFDiv
// resource with multi-cycle reservations. ZMM classes are unsupported.
948 defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
949 defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
950 defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
951 defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
952 defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
953 defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
954 defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
955 defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
956 defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
957 defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
958 defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
959 defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
960 defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
961 defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
962 defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
963 defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
964 defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
// Reciprocal / reciprocal-sqrt estimates, FMA, dot products, sign and
// rounding. Note the estimates differ in resource: rcp uses Zn3FPFMul01 while
// rsqrt uses Zn3FPFDiv. ZMM classes are unsupported.
965 defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
966 defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
967 defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
968 defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
969 defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
970 defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
971 defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
972 defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
973 defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add.
974 defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
975 defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
976 defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
977 defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
978 defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
979 defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
980 defm : X86WriteResPairUnsupported<WriteDPPSZ>; // Floating point single dot product (ZMM).
981 defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
982 defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
983 defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
984 defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
// FP logicals, TEST, shuffles and blends. ZMM classes are unsupported.
985 defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
986 defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
987 defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
988 defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
989 defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
990 defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
991 defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
992 defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
993 defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
994 defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
995 defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
996 defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
997 defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
998 defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
999 defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
1000 defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
1001 defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
1002 defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).
1004 // Horizontal Add/Sub (float and integer)
// The float classes use Zn3FPFAdd0 and the integer classes use Zn3FPVAdd0.
// Some pairs pass an explicit LoadUOps for the load-folded variant; the rest
// leave that argument at the multiclass default. ZMM classes are unsupported.
1005 defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
1006 defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
1007 defm : X86WriteResPairUnsupported<WriteFHAddZ>;
1008 defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
1009 defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
1010 defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
1011 defm : X86WriteResPairUnsupported<WritePHAddZ>;
1013 // Vector integer operations.
// Vector loads (plain, non-temporal, masked): Zn3FPLd01 + Zn3Load with
// latency VecLoadLatency + 1.
1014 defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1015 defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1016 defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1017 defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1018 defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1019 defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1020 defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
// Vector stores: Zn3FPSt + Zn3Store with the model's StoreLatency.
1021 defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1022 defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
// 128-bit lane extract, register form: one cycle on Zn3FPFMisc0, one micro-op.
// The two memory-related writes below derive their latency and micro-op
// counts from this record.
1024 def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
1026 let ResourceCycles = [1];
1027 let NumMicroOps = 1;
1029 def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
// 128-bit lane extract to memory: adds Zn3FPSt + Zn3Store and one micro-op.
// NOTE(review): this is a store form, yet its latency adds LoadLatency rather
// than StoreLatency - confirm this is intended.
1031 def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
1032 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1033 let ResourceCycles = [1, 1, 1];
1034 let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
1036 def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
// VINSERTF128 with a memory source: load resources plus Zn3FPFMisc0; latency
// is the extract latency plus LoadLatency, with no extra micro-op (+0).
1038 def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
1039 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1040 let ResourceCycles = [1, 1, 1];
1041 let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1043 def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
// Remaining vector stores (YMM, non-temporal) and masked stores. The masked
// store numbers mirror the FP masked store classes (same cycles and counts).
1045 defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1046 defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1047 defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1048 defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1049 defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
1050 defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
1051 defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
// Moves between vector and general-purpose registers, both directions, are
// modelled on Zn3FPLd01 with a 2-cycle reservation.
1053 defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
1054 defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;
// MMX register moves bound by InstRW: two micro-ops on Zn3FPLd01 and
// Zn3FPFMisc0123, with 2 cycles on the latter.
1056 def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1058 let ResourceCycles = [1, 2];
1059 let NumMicroOps = 2;
1061 def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
// Slower variant for the MMX_MOVD64* register forms: same resources, but
// 4 cycles on Zn3FPFMisc0123.
1063 def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1065 let ResourceCycles = [1, 4];
1066 let NumMicroOps = 2;
1068 def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1070 defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.
// EXTRQ/INSERTQ (register-operand forms): Zn3FPVShuf01 + Zn3FPLd01, one
// micro-op.
1072 def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1074 let ResourceCycles = [1, 1];
1075 let NumMicroOps = 1;
1077 def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
// EXTRQI/INSERTQI (immediate forms): same resources, two micro-ops.
1079 def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1081 let ResourceCycles = [1, 1];
1082 let NumMicroOps = 2;
1084 def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1086 defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
// XMM vector ALU ops restricted to Zn3FPVAdd01 instead of the Zn3FPVAdd0123
// used by the generic class above: the listed abs, sign and saturating
// add/sub register forms (SSE and VEX encodings).
1088 def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1090 let ResourceCycles = [1];
1091 let NumMicroOps = 1;
1093 def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1094 PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1096 PSIGNBrr, PSIGNDrr, PSIGNWrr,
1097 VPABSBrr, VPABSDrr, VPABSWrr,
1098 VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1101 VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1102 PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
// MMX counterpart of the restricted vector ALU write: the listed MMX abs,
// sign, saturating add/sub and average register forms on Zn3FPVAdd01.
1104 def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
1106 let ResourceCycles = [1];
1107 let NumMicroOps = 1;
1109 def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
1110 MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
1111 MMX_PADDSBirr, MMX_PADDSWirr, MMX_PADDUSBirr, MMX_PADDUSWirr,
1112 MMX_PAVGBirr, MMX_PAVGWirr,
1113 MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr)>;
1115 defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
// YMM vector ALU ops restricted to Zn3FPVAdd01: the listed abs, saturating
// add/sub, average and sign register forms.
1117 def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1119 let ResourceCycles = [1];
1120 let NumMicroOps = 1;
1122 def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1123 VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1124 VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1125 VPAVGBYrr, VPAVGWYrr,
1127 VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1129 defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
// Vector integer logicals, TEST, shifts, multiplies, shuffles, blends and
// sum-of-absolute-differences classes. Every ZMM (*Z) class in this group is
// declared unsupported.
1130 defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
1131 defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1132 defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1133 defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
1134 defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1135 defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
1136 defm : X86WriteResPairUnsupported<WriteVecTestZ>; // Vector integer TEST instructions (ZMM).
1137 defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
1138 defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
1139 defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1140 defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
1141 defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
1142 defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1143 defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1144 defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
1145 defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
1146 defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1147 defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1148 defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
1149 defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
1150 defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1151 defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
1152 defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles.
1153 defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1154 defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1155 defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
1156 defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
1157 defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1158 defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1159 defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
1160 defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
1161 defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1162 defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
1163 defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
1164 defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1165 defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
1166 defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
1167 defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1168 defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1169 defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
1170 defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1171 defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1172 defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
1173 defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.
1175 // Vector insert/extract operations.
// Insert is a pair whose load-folded variant passes LoadUOps=-1 (presumably
// one fewer micro-op than the register form - confirm against the multiclass).
// Extract-and-store adds 1 to the model's StoreLatency.
1176 defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1177 defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1178 defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1180 // MOVMSK operations.
// All four MOVMSK classes (FP, vector XMM/YMM, MMX) share identical
// parameters on the single Zn3FPVMisc2 resource.
1181 defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1182 defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1183 defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
1184 defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1186 // Conversion between integer and float.
// All conversion classes in this group run on Zn3FPFCvt01.
1187 defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
1188 defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
1189 defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
1190 defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).
// MMX packed-double -> packed-int conversions (CVTPD2PI / CVTTPD2PI).
// NOTE(review): the InstRW binds both the rm and rr forms to this one write,
// which lists no load resources - confirm the rm forms are meant to share it.
1192 def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1194 let ResourceCycles = [2];
1195 let NumMicroOps = 2;
1197 def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm, MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>;
// Float -> integer and integer -> double conversions on Zn3FPFCvt01.
// Pairs with LoadUOps=-1 give the load-folded variant a micro-op adjustment
// of -1 (semantics defined by the multiclass, which is outside this view).
1199 defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.
1201 defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1202 defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
1203 defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).
1205 defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
1206 defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1207 defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1208 defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).
// MMX packed-int -> packed-double (CVTPI2PD): 6 cycles on Zn3FPFCvt01, two
// micro-ops.
// NOTE(review): both the rm and rr forms use this write, which lists no load
// resources - confirm.
1210 def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1212 let ResourceCycles = [6];
1213 let NumMicroOps = 2;
1215 def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>;
// Integer -> float conversions on Zn3FPFCvt01; the ZMM class is unsupported.
1217 defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
1218 defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1219 defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1220 defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).
// MMX packed-int -> packed-float, register form only (MMX_CVTPI2PSirr):
// one cycle on Zn3FPFCvt01, two micro-ops.
1222 def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1224 let ResourceCycles = [1];
1225 let NumMicroOps = 2;
1227 def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSirr)>;
// Floating-point size conversions (float/double/half), all on Zn3FPFCvt01.
// The "+ store" half-conversion classes add Zn3FPSt + Zn3Store and the
// model's StoreLatency on top of the conversion latency. ZMM classes are
// unsupported.
1229 defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
1230 defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1231 defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1232 defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).
1234 defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
1235 defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1236 defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1237 defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).
1239 defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1240 defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1241 defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).
1243 defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1244 defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1245 defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
1246 defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1247 defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1248 defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).
1250 // CRC32 instruction.
// Restricted to the single Zn3ALU1 resource.
1251 defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;
// SHA1 message-schedule instructions. Each register-form write reserves
// Zn3FPU0123; each memory-form write adds the load resources, derives its
// latency as LoadLatency + the register form's latency, and reuses the
// register form's micro-op count (+0).
1253 def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1255 let ResourceCycles = [2];
1256 let NumMicroOps = 2;
1258 def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
1260 def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1261 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
1262 let ResourceCycles = [1, 1, 2];
1263 let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
1265 def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
// SHA1MSG2 and SHA1NEXTE share one write per operand form.
1267 def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
1269 let ResourceCycles = [2];
1270 let NumMicroOps = 1;
1272 def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
// NOTE(review): this record is named "Zn3Writerm_..." unlike its
// "Zn3Write<Instr>rm" siblings; the name is only referenced by the InstRW below.
1274 def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1275 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
1276 let ResourceCycles = [1, 1, 2];
1277 let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
1279 def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
// SHA256 message-schedule instructions on Zn3FPU0123. Memory forms add the
// load resources and LoadLatency on top of the register form's latency.
1281 def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1283 let ResourceCycles = [3];
1284 let NumMicroOps = 2;
1286 def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
// SHA256MSG1 memory form: same micro-op count as the register form (+0).
1288 def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1289 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
1290 let ResourceCycles = [1, 1, 3];
1291 let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
1293 def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1295 def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
1297 let ResourceCycles = [8];
1298 let NumMicroOps = 4;
1300 def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
// SHA256MSG2 memory form: one more micro-op than the register form.
// NOTE(review): the other SHA memory forms in this group add 0 - confirm the +1.
1302 def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1303 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
1304 let ResourceCycles = [1, 1, 8];
1305 let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
1307 def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
// SHA round instructions, register forms only: each reserves Zn3FPU0123 for
// 8 cycles as a single micro-op.
1309 def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
1311 let ResourceCycles = [8];
1312 let NumMicroOps = 1;
1314 def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
1316 def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
1318 let ResourceCycles = [8];
1319 let NumMicroOps = 1;
1321 def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1323 // String instructions.
// All four packed string-compare classes run on Zn3FPVAdd0123; the explicit-
// length variants reserve it for 12 cycles versus 8 for implicit-length.
1324 // Packed Compare Implicit Length Strings, Return Mask
1325 defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1326 // Packed Compare Explicit Length Strings, Return Mask
1327 defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1328 // Packed Compare Implicit Length Strings, Return Index
1329 defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
1330 // Packed Compare Explicit Length Strings, Return Index
1331 defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1333 // AES instructions.
// The three AES classes below share Zn3FPAES01 and identical numeric
// arguments (4, [1], 1).
1334 defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
1335 defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
1336 defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Generation.
1338 // Carry-less multiplication instructions.
// Uses a separate resource, Zn3FPCLM01, with arguments (4, [4], 4).
1339 defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;
// WriteEMMS is placed on Zn3ALU0123.
1342 defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
// WriteLDMXCSR: resources are listed as AGU, load, then ALU (cycles [1, 1, 6]);
// the third argument is Znver3Model.LoadLatency + 1.
1345 defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123], !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
// WriteSTMXCSR: the ALU is listed first with 60 cycles, then AGU and store;
// the third argument is 1 + Znver3Model.StoreLatency.
1346 defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store], !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1348 // Catch-all for expensive system instructions.
// Every numeric argument is pinned to 100 on Zn3ALU0123.
1349 defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;
// Scheduling write for VZEROUPPER: Latency 0 (flagged below as not measured),
// one resource cycle on Zn3FPU0123 and a single micro-op.
1351 def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
1352 let Latency = 0; // FIXME: not from llvm-exegesis
1353 let ResourceCycles = [1];
1354 let NumMicroOps = 1;
1356 def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;
// Scheduling write for VZEROALL: Latency 10 (flagged below as not measured),
// 24 resource cycles on Zn3FPU0123 and 18 micro-ops.
1358 def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
1359 let Latency = 10; // FIXME: not from llvm-exegesis
1360 let ResourceCycles = [24];
1361 let NumMicroOps = 18;
1363 def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;
// 256-bit shuffle classes, all on Zn3FPVShuf. WriteFShuffle256 and
// WriteFVarShuffle256 pass an explicit LoadUOps; WriteShuffle256 does not.
1366 defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1367 defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1368 defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>; // 256-bit width vector shuffles.
// Shared scheduling write for VPERM2I128rr and VPERM2F128rr: one resource
// cycle on Zn3FPVShuf and a single micro-op.
1370 def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
1372 let ResourceCycles = [1];
1373 let NumMicroOps = 1;
1375 def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
// Load variant derived from Zn3WriteVPERM2I128rr_VPERM2F128rr: adds one cycle
// each on Zn3AGU012 and Zn3Load, adds Znver3Model.LoadLatency to the latency,
// and keeps the register form's micro-op count (+0).
// The InstRW below binds it to VPERM2F128rm only.
1377 def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1378 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
1379 let ResourceCycles = [1, 1, 1];
1380 let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
1382 def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
// Scheduling write for VPERMPSYrr: one resource cycle on Zn3FPVShuf,
// counted as 2 micro-ops.
1384 def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
1386 let ResourceCycles = [1];
1387 let NumMicroOps = 2;
1389 def : InstRW<[Zn3WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
// Load variant used for VPERMPSYrm, derived from Zn3WriteVPERMPSYrr:
//  * resources: Zn3AGU012 and Zn3Load (1 cycle each) plus Zn3FPVShuf (2 cycles);
//  * latency:   Znver3Model.LoadLatency added to the register form's latency;
//  * micro-ops: one more than the register form.
1391 def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1392 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMPSYrr.Latency);
1393 let ResourceCycles = [1, 1, 2];
1394 let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
1396 def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
// Shared scheduling write for VPERMPDYri and VPERMQYri: one resource cycle
// on Zn3FPVShuf, counted as 2 micro-ops.
1398 def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
1400 let ResourceCycles = [1];
1401 let NumMicroOps = 2;
1403 def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
// Load variant used for VPERMPDYmi, derived from Zn3WriteVPERMYri:
//  * resources: Zn3AGU012 and Zn3Load (1 cycle each) plus Zn3FPVShuf (2 cycles);
//  * latency:   Znver3Model.LoadLatency added to the register form's latency;
//  * micro-ops: one more than the register form.
1405 def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1406 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
1407 let ResourceCycles = [1, 1, 2];
1408 let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
1410 def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
// Scheduling write for VPERMDYrr: one resource cycle on Zn3FPVShuf,
// counted as 2 micro-ops.
1412 def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
1414 let ResourceCycles = [1];
1415 let NumMicroOps = 2;
1417 def : InstRW<[Zn3WriteVPERMDYrr], (instrs VPERMDYrr)>;
// Load variant shared by VPERMQYmi and VPERMDYrm, derived from
// Zn3WriteVPERMDYrr:
//  * resources: Zn3AGU012 and Zn3Load (1 cycle each) plus Zn3FPVShuf (2 cycles);
//  * latency:   Znver3Model.LoadLatency added to the register form's latency;
//  * micro-ops: same count as the register form (+0).
1419 def Zn3WriteVPERMYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1420 let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMDYrr.Latency);
1421 let ResourceCycles = [1, 1, 2];
1422 let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
1424 def : InstRW<[Zn3WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
// WriteVPMOV256 passes LoadUOps = -1.
// NOTE(review): presumably this gives the load form one micro-op fewer than
// the register form -- confirm against Zn3WriteResYMMPair.
1426 defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1427 defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
// The XMM and YMM variable-shift classes share Zn3FPVShift01 and the same
// numeric arguments; the ZMM class is declared unsupported.
1428 defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts.
1429 defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1430 defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Variable vector shifts (ZMM).
1432 // Old microcoded instructions that nobody uses.
1433 defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;
1435 // Fence instructions.
// Generic fence class: Zn3ALU0123 with arguments (1, [100], 1).
1436 defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;
// Scheduling write for LFENCE: holds Zn3LSU for 30 resource cycles and
// counts as a single micro-op.
1438 def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
1440 let ResourceCycles = [30];
1441 let NumMicroOps = 1;
1443 def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;
// Scheduling write for SFENCE: holds Zn3LSU for one resource cycle and
// counts as a single micro-op.
1445 def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
1447 let ResourceCycles = [1];
1448 let NumMicroOps = 1;
1450 def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;
1452 // Nop, not very useful except it provides a model for nops!
1453 defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1456 ///////////////////////////////////////////////////////////////////////////////
1458 ///////////////////////////////////////////////////////////////////////////////
// Write that names no processor resources (both the resource list and
// ResourceCycles are empty) and counts as one micro-op. The InstRW below
// applies it to the register-to-register MOV forms it lists; the zero-idiom
// SchedWriteVariants in this file also select it.
1460 def Zn3WriteZeroLatency : SchedWriteRes<[]> {
1462 let ResourceCycles = [];
1463 let NumMicroOps = 1;
1465 def : InstRW<[Zn3WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
1466 MOV64rr, MOV64rr_REV,
// Write that names no processor resources (both the resource list and
// ResourceCycles are empty) and counts as 2 micro-ops. Bound below to the
// 32- and 64-bit register XCHG forms (rr and ar).
1469 def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
1471 let ResourceCycles = [];
1472 let NumMicroOps = 2;
1474 def : InstRW<[Zn3WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
1475 XCHG64rr, XCHG64ar)>;
1477 defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
// Move classes: the X and Y variants of both families are given empty
// resource lists, empty resource cycles and a 0 in the third argument, while
// the base WriteFMove / WriteVecMove classes each occupy one execution resource.
1479 defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
1480 defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
1481 defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
1483 defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
1484 defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
1485 defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
// Register-to-register moves grouped into a single InstructionEquivalenceClass
// and passed to IsOptimizableRegisterMove: 32/64-bit GPR moves followed by
// the MOV*/VMOV* vector moves, each listed together with its _REV encoding.
// NOTE(review): presumably this is what analysis tools such as llvm-mca
// consult for move elimination -- confirm.
1487 def : IsOptimizableRegisterMove<[
1488 InstructionEquivalenceClass<[
1490 MOV32rr, MOV32rr_REV,
1491 MOV64rr, MOV64rr_REV,
1497 // MMX moves are *NOT* eliminated.
1500 MOVAPSrr, MOVAPSrr_REV,
1501 MOVUPSrr, MOVUPSrr_REV,
1502 MOVAPDrr, MOVAPDrr_REV,
1503 MOVUPDrr, MOVUPDrr_REV,
1504 MOVDQArr, MOVDQArr_REV,
1505 MOVDQUrr, MOVDQUrr_REV,
1508 VMOVAPSrr, VMOVAPSrr_REV,
1509 VMOVUPSrr, VMOVUPSrr_REV,
1510 VMOVAPDrr, VMOVAPDrr_REV,
1511 VMOVUPDrr, VMOVUPDrr_REV,
1512 VMOVDQArr, VMOVDQArr_REV,
1513 VMOVDQUrr, VMOVDQUrr_REV,
1515 // AVX YMM variants.
1516 VMOVAPSYrr, VMOVAPSYrr_REV,
1517 VMOVUPSYrr, VMOVUPSYrr_REV,
1518 VMOVAPDYrr, VMOVAPDYrr_REV,
1519 VMOVUPDYrr, VMOVUPDYrr_REV,
1520 VMOVDQAYrr, VMOVDQAYrr_REV,
1521 VMOVDQUYrr, VMOVDQUYrr_REV,
1525 ///////////////////////////////////////////////////////////////////////////////
1526 // Dependency breaking instructions.
1527 ///////////////////////////////////////////////////////////////////////////////
// Variant write for the 32/64-bit GPR XOR and SUB register forms bound below:
// selects Zn3WriteZeroLatency when ZeroIdiomPredicate matches, and falls back
// to the generic WriteALU otherwise.
1529 def Zn3WriteZeroIdiom : SchedWriteVariant<[
1530 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1531 SchedVar<NoSchedPred, [WriteALU]>
1533 def : InstRW<[Zn3WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
1534 XOR64rr, XOR64rr_REV,
1535 SUB32rr, SUB32rr_REV,
1536 SUB64rr, SUB64rr_REV)>;
// Variant write for the 8/16/32/64-bit CMP register forms bound below:
// selects Zn3WriteZeroLatency when CheckSameRegOperand<0, 1> matches, and
// falls back to the generic WriteALU otherwise.
1538 def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
1539 SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
1540 SchedVar<NoSchedPred, [WriteALU]>
1542 def : InstRW<[Zn3WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
1543 CMP16rr, CMP16rr_REV,
1544 CMP32rr, CMP32rr_REV,
1545 CMP64rr, CMP64rr_REV)>;
// Variant write for the VEX-encoded XMM FP logic forms bound below (VXORPS/PD,
// VANDNPS/PD): selects Zn3WriteZeroLatency when ZeroIdiomPredicate matches,
// and falls back to WriteFLogic otherwise.
1547 def Zn3WriteFZeroIdiom : SchedWriteVariant<[
1548 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1549 SchedVar<NoSchedPred, [WriteFLogic]>
1551 // NOTE: XORPSrr, XORPDrr are not zero-cycle!
1552 def : InstRW<[Zn3WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
1553 VANDNPSrr, VANDNPDrr)>;
// YMM counterpart of the FP logic zero-idiom variant: selects
// Zn3WriteZeroLatency when ZeroIdiomPredicate matches, and falls back to
// WriteFLogicY otherwise. Bound to VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr.
1555 def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
1556 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1557 SchedVar<NoSchedPred, [WriteFLogicY]>
1559 def : InstRW<[Zn3WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
1560 VANDNPSYrr, VANDNPDYrr)>;
// Variant write for VPXORrr and VPANDNrr: selects Zn3WriteZeroLatency when
// ZeroIdiomPredicate matches, and falls back to WriteVecLogicX otherwise.
1562 def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
1563 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1564 SchedVar<NoSchedPred, [WriteVecLogicX]>
1566 // NOTE: PXORrr,PANDNrr are not zero-cycle!
1567 def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
// Variant write for VPXORYrr and VPANDNYrr: selects Zn3WriteZeroLatency when
// ZeroIdiomPredicate matches, and falls back to WriteVecLogicY otherwise.
1569 def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
1570 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1571 SchedVar<NoSchedPred, [WriteVecLogicY]>
1573 def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
// Variant write for the VEX-encoded XMM VPSUB{B,W,D,Q} and VPCMPGT{B,W,D,Q}
// register forms: selects Zn3WriteZeroLatency when ZeroIdiomPredicate
// matches, and falls back to WriteVecALUX otherwise.
1575 def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
1576 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1577 SchedVar<NoSchedPred, [WriteVecALUX]>
1579 // NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1580 // PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1581 def : InstRW<[Zn3WriteVZeroIdiomALUX],
1582 (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1583 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
// Variant write for the YMM VPSUB{B,W,D,Q}Y and VPCMPGT{B,W,D,Q}Y register
// forms: selects Zn3WriteZeroLatency when ZeroIdiomPredicate matches, and
// falls back to WriteVecALUY otherwise.
1585 def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
1586 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1587 SchedVar<NoSchedPred, [WriteVecALUY]>
1589 def : InstRW<[Zn3WriteVZeroIdiomALUY],
1590 (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1591 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
// Instruction groups registered with IsZeroIdiomFunction. Each group is a
// DepBreakingClass qualified by ZeroIdiomPredicate: GPR XOR/SUB first, then
// the SSE XMM, AVX XMM and AVX YMM groups.
1593 def : IsZeroIdiomFunction<[
1595 DepBreakingClass<[ XOR32rr, XOR32rr_REV,
1596 XOR64rr, XOR64rr_REV,
1597 SUB32rr, SUB32rr_REV,
1598 SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
1600 // SSE XMM Zero-idioms.
1609 PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1611 PSUBUSBrr, PSUBUSWrr,
1612 PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
1613 ], ZeroIdiomPredicate>,
1615 // AVX XMM Zero-idioms.
1619 VANDNPSrr, VANDNPDrr,
1624 VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1625 VPSUBSBrr, VPSUBSWrr,
1626 VPSUBUSBrr, VPSUBUSWrr,
1627 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1628 ], ZeroIdiomPredicate>,
1630 // AVX YMM Zero-idioms.
1633 VXORPSYrr, VXORPDYrr,
1634 VANDNPSYrr, VANDNPDYrr,
1639 VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1640 VPSUBSBYrr, VPSUBSWYrr,
1641 VPSUBUSBYrr, VPSUBUSWYrr,
1642 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
1643 ], ZeroIdiomPredicate>,
// Instruction groups registered with IsDepBreakingFunction. The CMP class is
// qualified by CheckSameRegOperand<0, 1>; the SBB class and the PCMPEQ
// families (MMX, SSE, AVX XMM, AVX YMM) are qualified by ZeroIdiomPredicate.
1646 def : IsDepBreakingFunction<[
1648 DepBreakingClass<[ SBB32rr, SBB32rr_REV,
1649 SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
1650 DepBreakingClass<[ CMP8rr, CMP8rr_REV,
1651 CMP16rr, CMP16rr_REV,
1652 CMP32rr, CMP32rr_REV,
1653 CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
1657 MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr
1658 ], ZeroIdiomPredicate>,
1662 PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1663 ], ZeroIdiomPredicate>,
1667 VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1668 ], ZeroIdiomPredicate>,
1672 VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
1673 ], ZeroIdiomPredicate>,