1 //=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines the machine model for Znver4 to support instruction
10 // scheduling and other instruction cost heuristics.
12 // * AMD Software Optimization Guide for AMD Family 19h Processors.
13 // https://www.amd.com/system/files/TechDocs/56665.zip
14 //===----------------------------------------------------------------------===//
16 def Znver4Model : SchedMachineModel {
17 // AMD SOG 19h, 2.9.6 Dispatch
18 // The processor may dispatch up to 6 macro ops per cycle
19 // into the execution engine.
21 // AMD SOG 19h, 2.10.3
22 // The retire control unit (RCU) tracks the completion status of all
23 // outstanding operations (integer, load/store, and floating-point) and is
24 // the final arbiter for exception processing and recovery.
25 // The unit can receive up to 6 macro ops dispatched per cycle and track up
26 // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
27 let MicroOpBufferSize = 320;
28 // AMD SOG 19h, 2.9.1 Op Cache
29 // The op cache is organized as an associative cache with 64 sets and 8 ways.
30 // At each set-way intersection is an entry containing up to 8 macro ops.
31 // The maximum capacity of the op cache is 4K ops.
32 // Agner, 22.5 µop cache
33 // The size of the µop cache is big enough for holding most critical loops.
34 // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadradic complexity,
35 // with large values here the compilation of certain loops
36 // ends up taking way too long.
37 // Ideally for znver4, we should have 6.75K. However we don't add that
38 // considerting the impact compile time and prefer using default values
40 // Retaining minimal value to influence unrolling as we did for znver3.
41 let LoopMicroOpBufferSize = 512;
42 // AMD SOG 19h, 2.6.2 L1 Data Cache
43 // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
44 // AMD SOG 19h, 2.12 L1 Data Cache
45 // The AGU and LS pipelines are optimized for simple address generation modes.
46 // <...> and can achieve 4-cycle load-to-use integer load latency.
48 // AMD SOG 19h, 2.12 L1 Data Cache
49 // The AGU and LS pipelines are optimized for simple address generation modes.
50 // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
51 int VecLoadLatency = 7;
52 // Latency of a simple store operation.
55 let HighLatency = 25; // FIXME: any better choice?
56 // AMD SOG 19h, 2.8 Optimizing Branching
57 // The branch misprediction penalty is in the range from 11 to 18 cycles,
58 // <...>. The common case penalty is 13 cycles.
59 let MispredictPenalty = 13;
61 let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
63 let CompleteModel = 1;
66 let SchedModel = Znver4Model in {
69 //===----------------------------------------------------------------------===//
71 //===----------------------------------------------------------------------===//
73 // AMD SOG 19h, 2.10.3 Retire Control Unit
74 // The unit can receive up to 6 macro ops dispatched per cycle and track up to
75 // 320 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
76 // The retire unit handles in-order commit of up to nine macro ops per cycle.
77 def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
79 //===----------------------------------------------------------------------===//
80 // Integer Execution Unit
83 // AMD SOG 19h, 2.4 Superscalar Organization
84 // The processor uses four decoupled independent integer scheduler queues,
85 // each one servicing one ALU pipeline and one or two other pipelines
89 //===----------------------------------------------------------------------===//
91 // AMD SOG 19h, 2.10.2 Execution Units
92 // The processor contains 4 general purpose integer execution pipes.
93 // Each pipe has an ALU capable of general purpose integer operations.
94 def Zn4ALU0 : ProcResource<1>;
95 def Zn4ALU1 : ProcResource<1>;
96 def Zn4ALU2 : ProcResource<1>;
97 def Zn4ALU3 : ProcResource<1>;
99 // AMD SOG 19h, 2.10.2 Execution Units
100 // There is also a separate branch execution unit.
101 def Zn4BRU1 : ProcResource<1>;
103 // AMD SOG 19h, 2.10.2 Execution Units
104 // There are three Address Generation Units (AGUs) for all load and store
105 // address generation. There are also 3 store data movement units
106 // associated with the same schedulers as the AGUs.
107 def Zn4AGU0 : ProcResource<1>;
108 def Zn4AGU1 : ProcResource<1>;
109 def Zn4AGU2 : ProcResource<1>;
113 //===----------------------------------------------------------------------===//
115 // AMD SOG 19h, 2.10.2 Execution Units
116 // ALU0 additionally has divide <...> execution capability.
117 defvar Zn4Divider = Zn4ALU0;
119 // AMD SOG 19h, 2.10.2 Execution Units
120 // ALU0 additionally has <...> branch execution capability.
121 defvar Zn4BRU0 = Zn4ALU0;
123 // Integer Multiplication issued on ALU1.
124 defvar Zn4Multiplier = Zn4ALU1;
126 // Execution pipeline grouping
127 //===----------------------------------------------------------------------===//
129 // General ALU operations
130 def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;
132 // General AGU operations
133 def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;
135 // Control flow: jumps, calls
136 def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;
138 // Everything that isn't control flow, but still needs to access CC register,
139 // namely: conditional moves, SETcc.
140 def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;
142 // Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
144 // Simple bit twiddling: bit test, shift/rotate, bit extraction
145 def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
150 //===----------------------------------------------------------------------===//
152 // AMD SOG 19h, 2.10.3 Retire Control Unit
153 // The integer physical register file (PRF) consists of 224 registers.
154 def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
155 6, // Max moves that can be eliminated per cycle.
156 0>; // Restrict move elimination to zero regs.
158 // anandtech, The integer scheduler has a 4*24 entry macro op capacity.
159 // AMD SOG 19h, 2.10.1 Schedulers
160 // The schedulers can receive up to six macro ops per cycle, with a limit of
161 // two per scheduler. Each scheduler can issue one micro op per cycle into
162 // each of its associated pipelines
163 def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
164 Zn4ALU1, Zn4AGU1, // scheduler 1
165 Zn4ALU2, Zn4AGU2, // scheduler 2
166 Zn4ALU3, Zn4BRU1 // scheduler 3
168 let BufferSize = !mul(4, 24);
172 //===----------------------------------------------------------------------===//
173 // Floating-Point Unit
176 // AMD SOG 19h, 2.4 Superscalar Organization
177 // The processor uses <...> two decoupled independent floating point schedulers
178 // each servicing two FP pipelines and one store or FP-to-integer pipeline.
182 //===----------------------------------------------------------------------===//
184 // AMD SOG 19h, 2.10.1 Schedulers
185 // <...>, and six FPU pipes.
186 // Agner, 22.10 Floating point execution pipes
187 // There are six floating point/vector execution pipes,
188 def Zn4FP0 : ProcResource<1>;
189 def Zn4FP1 : ProcResource<1>;
190 def Zn4FP2 : ProcResource<1>;
191 def Zn4FP3 : ProcResource<1>;
192 def Zn4FP45 : ProcResource<2>;
196 //===----------------------------------------------------------------------===//
197 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
199 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
200 defvar Zn4FPFMul0 = Zn4FP0;
201 defvar Zn4FPFMul1 = Zn4FP1;
204 defvar Zn4FPFAdd0 = Zn4FP2;
205 defvar Zn4FPFAdd1 = Zn4FP3;
207 // All convert operations except pack/unpack
208 defvar Zn4FPFCvt0 = Zn4FP2;
209 defvar Zn4FPFCvt1 = Zn4FP3;
211 // All Divide and Square Root except Reciprocal Approximation
212 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
213 // FDIV unit can support 2 simultaneous operations in flight
214 // even though it occupies a single pipe.
215 // FIXME: BufferSize=2 ?
216 defvar Zn4FPFDiv = Zn4FP1;
218 // Moves and Logical operations on Floating Point Data Types
219 defvar Zn4FPFMisc0 = Zn4FP0;
220 defvar Zn4FPFMisc1 = Zn4FP1;
221 defvar Zn4FPFMisc2 = Zn4FP2;
222 defvar Zn4FPFMisc3 = Zn4FP3;
224 // Integer Adds, Subtracts, and Compares
225 // Some complex VADD operations are not available in all pipes.
226 defvar Zn4FPVAdd0 = Zn4FP0;
227 defvar Zn4FPVAdd1 = Zn4FP1;
228 defvar Zn4FPVAdd2 = Zn4FP2;
229 defvar Zn4FPVAdd3 = Zn4FP3;
231 // Integer Multiplies, SAD, Blendvb
232 defvar Zn4FPVMul0 = Zn4FP0;
233 defvar Zn4FPVMul1 = Zn4FP3;
235 // Data Shuffles, Packs, Unpacks, Permute
236 // Some complex shuffle operations are only available in pipe1.
237 defvar Zn4FPVShuf = Zn4FP1;
238 defvar Zn4FPVShufAux = Zn4FP2;
240 // Bit Shift Left/Right operations
241 defvar Zn4FPVShift0 = Zn4FP1;
242 defvar Zn4FPVShift1 = Zn4FP2;
244 // Moves and Logical operations on Packed Integer Data Types
245 defvar Zn4FPVMisc0 = Zn4FP0;
246 defvar Zn4FPVMisc1 = Zn4FP1;
247 defvar Zn4FPVMisc2 = Zn4FP2;
248 defvar Zn4FPVMisc3 = Zn4FP3;
251 defvar Zn4FPAES0 = Zn4FP0;
252 defvar Zn4FPAES1 = Zn4FP1;
255 defvar Zn4FPCLM0 = Zn4FP0;
256 defvar Zn4FPCLM1 = Zn4FP1;
258 // Execution pipeline grouping
259 //===----------------------------------------------------------------------===//
261 // AMD SOG 19h, 2.11 Floating-Point Unit
262 // Stores and floating point to general purpose register transfer
263 // have 2 dedicated pipelines (pipe 5 and 6).
264 def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
266 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
267 def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;
270 // Some complex VADD operations are not available in all pipes.
271 def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;
273 // All convert operations except pack/unpack
274 def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;
276 // All Divide and Square Root except Reciprocal Approximation
277 // def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;
279 // Moves and Logical operations on Floating Point Data Types
280 def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
282 // FIXUP and RANGE use FP01 pipelines
283 def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
284 def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
285 // SCALE instructions use FP23 pipelines
286 def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
287 def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;
289 // Loads, Stores and Move to General Register (EX) Operations
290 // AMD SOG 19h, 2.11 Floating-Point Unit
291 // Stores and floating point to general purpose register transfer
292 // have 2 dedicated pipelines (pipe 5 and 6).
293 defvar Zn4FPLd01 = Zn4FP45;
295 // AMD SOG 19h, 2.11 Floating-Point Unit
296 // Note that FP stores are supported on two pipelines,
297 // but throughput is limited to one per cycle.
298 let Super = Zn4FP45 in
299 def Zn4FPSt : ProcResource<1>;
301 // Integer Adds, Subtracts, and Compares
302 // Some complex VADD operations are not available in all pipes.
303 def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;
305 def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
306 def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;
308 // AVX512 Opmask pipelines
309 def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>;
310 def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>;
312 // Integer Multiplies, SAD, Blendvb
313 def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;
315 // Data Shuffles, Packs, Unpacks, Permute
316 // Some complex shuffle operations are only available in pipe1.
317 def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;
319 // Bit Shift Left/Right operations
320 def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;
322 // Moves and Logical operations on Packed Integer Data Types
323 def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;
326 def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;
329 def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;
334 //===----------------------------------------------------------------------===//
336 // Agner, 21.8 Register renaming and out-of-order schedulers
337 // The floating point register file has 192 vector registers
338 // of 512b each in zen4.
339 def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
340 6, // Max moves that can be eliminated per cycle.
341 0>; // Restrict move elimination to zero regs.
343 // AMD SOG 19h, 2.11 Floating-Point Unit
344 // The floating-point scheduler has a 2*32 entry macro op capacity.
345 // AMD SOG 19h, 2.11 Floating-Point Unit
346 // <...> the scheduler can issue 1 micro op per cycle for each pipe.
347 // FIXME: those are two separate schedulers, not a single big one.
348 def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
349 Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1
351 let BufferSize = !mul(2, 32);
354 // AMD SOG 19h, 2.11 Floating-Point Unit
355 // Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
356 // even if floating-point scheduler is full.
357 // FIXME: how to model this properly?
360 //===----------------------------------------------------------------------===//
364 // AMD SOG 19h, 2.12 Load-Store Unit
365 // The LS unit contains three largely independent pipe-lines
366 // enabling the execution of three 256-bit memory operations per cycle.
367 def Zn4LSU : ProcResource<3>;
369 // AMD SOG 19h, 2.12 Load-Store Unit
370 // All three memory operations can be loads.
371 let Super = Zn4LSU in
372 def Zn4Load : ProcResource<3> {
373 // AMD SOG 19h, 2.12 Load-Store Unit
374 // The LS unit can process up to 72 out-of-order loads.
378 def Zn4LoadQueue : LoadQueue<Zn4Load>;
380 // AMD SOG 19h, 2.12 Load-Store Unit
381 // A maximum of two of the memory operations can be stores.
382 let Super = Zn4LSU in
383 def Zn4Store : ProcResource<2> {
384 // AMD SOG 19h, 2.12 Load-Store Unit
385 // The LS unit utilizes a 64-entry store queue (STQ).
389 def Zn4StoreQueue : StoreQueue<Zn4Store>;
391 //===----------------------------------------------------------------------===//
392 // Basic helper classes.
393 //===----------------------------------------------------------------------===//
395 // Many SchedWrites are defined in pairs with and without a folded load.
396 // Instructions with folded loads are usually micro-fused, so they only appear
397 // as two micro-ops when dispatched by the schedulers.
398 // This multiclass defines the resource usage for variants with and without
401 multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
402 int Lat = 1, list<int> Res = [], int UOps = 1> {
403 def : WriteRes<SchedRW, ExePorts> {
405 let ReleaseAtCycles = Res;
406 let NumMicroOps = UOps;
410 multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
411 list<ProcResourceKind> ExePorts, int Lat,
412 list<int> Res, int UOps, int LoadLat, int LoadUOps,
413 ProcResourceKind AGU, int LoadRes> {
414 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
416 defm : __Zn4WriteRes<SchedRW.Folded,
417 !listconcat([AGU, Zn4Load], ExePorts),
419 !if(!and(!empty(Res), !eq(LoadRes, 1)),
421 !listconcat([1, LoadRes],
423 !listsplat(1, !size(ExePorts)),
425 !add(UOps, LoadUOps)>;
428 // For classes without folded loads.
429 multiclass Zn4WriteResInt<SchedWrite SchedRW,
430 list<ProcResourceKind> ExePorts, int Lat = 1,
431 list<int> Res = [], int UOps = 1> {
432 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
435 multiclass Zn4WriteResXMM<SchedWrite SchedRW,
436 list<ProcResourceKind> ExePorts, int Lat = 1,
437 list<int> Res = [], int UOps = 1> {
438 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
441 multiclass Zn4WriteResYMM<SchedWrite SchedRW,
442 list<ProcResourceKind> ExePorts, int Lat = 1,
443 list<int> Res = [], int UOps = 1> {
444 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
447 multiclass Zn4WriteResZMM<SchedWrite SchedRW,
448 list<ProcResourceKind> ExePorts, int Lat = 1,
449 list<int> Res = [], int UOps = 1> {
450 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
453 // For classes with folded loads.
454 multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
455 list<ProcResourceKind> ExePorts, int Lat = 1,
456 list<int> Res = [], int UOps = 1,
457 int LoadUOps = 0, int LoadRes = 1> {
458 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
459 Znver4Model.LoadLatency,
460 LoadUOps, Zn4AGU012, LoadRes>;
463 multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
464 list<ProcResourceKind> ExePorts, int Lat = 1,
465 list<int> Res = [], int UOps = 1,
466 int LoadUOps = 0, int LoadRes = 1> {
467 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
468 Znver4Model.VecLoadLatency,
469 LoadUOps, Zn4FPLd01, LoadRes>;
472 multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
473 list<ProcResourceKind> ExePorts, int Lat = 1,
474 list<int> Res = [], int UOps = 1,
475 int LoadUOps = 0, int LoadRes = 1> {
476 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
477 Znver4Model.VecLoadLatency,
478 LoadUOps, Zn4FPLd01, LoadRes>;
481 multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
482 list<ProcResourceKind> ExePorts, int Lat = 1,
483 list<int> Res = [], int UOps = 2,
484 int LoadUOps = 0, int LoadRes = 1> {
485 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
486 Znver4Model.VecLoadLatency,
487 LoadUOps, Zn4FPLd01, LoadRes>;
490 //===----------------------------------------------------------------------===//
492 //===----------------------------------------------------------------------===//
494 def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;
496 def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
497 def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
498 def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
500 // AMD SOG 19h, 2.11 Floating-Point Unit
501 // There is 1 cycle of added latency for a result to cross
502 // from F to I or I to F domain.
503 def : ReadAdvance<ReadInt2Fpu, -1>;
505 // Instructions with both a load and a store folded are modeled as a folded
507 defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;
509 // Loads, stores, and moves, not folded with other operations.
510 defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;
512 // Model the effect of clobbering the read-write mask operand of the GATHER operation.
513 // Does not cost anything by itself, only has latency, matching that of the WriteLoad,
514 defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;
516 def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
517 let Latency = !add(Znver4Model.LoadLatency, 1);
518 let ReleaseAtCycles = [3, 1];
521 def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
523 defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
524 defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
525 defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;
527 // Treat misc copies as a move.
528 def : InstRW<[WriteMove], (instrs COPY)>;
530 def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
531 let Latency = Znver4Model.LoadLatency;
532 let ReleaseAtCycles = [1, 1, 4];
535 def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;
537 def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
538 let Latency = Znver4Model.StoreLatency;
539 let ReleaseAtCycles = [4, 1, 1];
542 def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
545 defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.
547 def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
549 let ReleaseAtCycles = [4];
552 def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
553 AND8i8, AND16i16, AND32i32, AND64i32,
554 OR8i8, OR16i16, OR32i32, OR64i32,
555 SUB8i8, SUB16i16, SUB32i32, SUB64i32,
556 XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
558 def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
560 let ReleaseAtCycles = [4];
563 def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
565 def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
567 let ReleaseAtCycles = [2];
570 def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
572 def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
574 let ReleaseAtCycles = [1];
577 def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
578 PEXT32rr, PEXT64rr)>;
580 defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.
582 def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
584 let ReleaseAtCycles = [1, 1, 7, 1];
587 def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
589 // This is for simple LEAs with one or two input operands.
590 defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.
592 // This write is used for slow LEA instructions.
593 def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
595 let ReleaseAtCycles = [1];
599 // On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
600 // or an LEA with a `Scale` value different than 1.
601 def Zn4SlowLEAPredicate : MCSchedPredicate<
603 // A 3-operand LEA (base, index, offset).
604 IsThreeOperandsLEAFn,
605 // An LEA with a "Scale" different than 1.
607 CheckIsImmOperand<2>,
608 CheckNot<CheckImmOperand<2, 1>>
613 def Zn4WriteLEA : SchedWriteVariant<[
614 SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
615 SchedVar<NoSchedPred, [WriteLEA]>
618 def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
620 def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
621 let Latency = 2; // FIXME: not from llvm-exegesis
622 let ReleaseAtCycles = [4];
626 def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;
628 // Integer multiplication
629 defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
630 defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
631 defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
632 defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
633 defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
634 defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
635 defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
636 defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
637 defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
638 defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
639 defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
640 defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
641 defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part.
642 defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
644 defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
645 defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
647 defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
649 def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
651 let ReleaseAtCycles = [12];
654 def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
656 defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap.
658 def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
659 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
660 let ReleaseAtCycles = [1, 1, 12];
661 let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
663 def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
665 def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
666 let Latency = 3; // FIXME: not from llvm-exegesis
667 let ReleaseAtCycles = [24];
668 let NumMicroOps = 19;
670 def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
672 def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
673 let Latency = 4; // FIXME: not from llvm-exegesis
674 let ReleaseAtCycles = [59];
675 let NumMicroOps = 28;
677 def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
679 def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
681 let ReleaseAtCycles = [2];
684 def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
686 def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
687 let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
688 let ReleaseAtCycles = [1, 1, 2];
691 def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
693 def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
694 let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
695 let ReleaseAtCycles = [1, 1, 2];
698 def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
701 // FIXME: uops for 8-bit division measures as 2. for others it's a guess.
702 // FIXME: latency for 8-bit division measures as 10. for others it's a guess.
703 defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
704 defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
705 defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
706 defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
707 defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
708 defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
709 defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
710 defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
712 defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
713 defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
715 defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
717 def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
719 let ReleaseAtCycles = [4];
722 def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;
724 defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.
726 def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
728 let ReleaseAtCycles = [4];
731 def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
733 defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
735 def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
737 let ReleaseAtCycles = [4];
740 def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
742 defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
743 defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
744 defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
745 defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
746 defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.
748 defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
749 defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
750 defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;
752 defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
753 defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
754 defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
756 // Integer shifts and rotates.
757 defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
758 defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
759 defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
761 def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
763 let ReleaseAtCycles = [2];
766 def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
767 RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
769 def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
770 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
771 let ReleaseAtCycles = [1, 1, 2];
772 let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
774 def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
775 RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
777 def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
779 let ReleaseAtCycles = [6];
782 def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
784 def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
785 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
786 let ReleaseAtCycles = [1, 1, 8];
787 let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
789 def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
791 def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
793 let ReleaseAtCycles = [8];
796 def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
798 def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
799 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
800 let ReleaseAtCycles = [1, 1, 8];
801 let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
803 def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
805 defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
807 def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
809 let ReleaseAtCycles = [6];
812 def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
814 def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
815 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
816 let ReleaseAtCycles = [1, 1, 8];
817 let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
819 def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
821 def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
823 let ReleaseAtCycles = [8];
826 def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
828 def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
829 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
830 let ReleaseAtCycles = [1, 1, 8];
831 let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
833 def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
835 // Double shift instructions.
836 defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
837 defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
838 defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
839 defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
841 // BMI1 BEXTR/BLS, BMI2 BZHI
842 defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
843 defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
844 defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
846 // Idioms that clear a register, like xorps %xmm0, %xmm0.
847 // These can often bypass execution ports completely.
848 defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;
850 // Branches don't produce values, so they have no latency, but they still
851 // consume resources. Indirect branches can fold loads.
852 defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
854 // Floating point. This covers both scalar and vector operations.
855 defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
856 defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
857 defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
858 defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
859 defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
860 defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
861 defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
862 defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
863 defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
865 def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
866 let Latency = 2; // FIXME: not from llvm-exegesis
867 let ReleaseAtCycles = [1, 1];
870 def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
871 VMOVHPDmr, VMOVHPSmr)>;
873 defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
874 defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
875 defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
876 defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
877 defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
879 defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
880 defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
881 defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
882 defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
884 defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub.
886 def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
887 let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
888 let ReleaseAtCycles = [1, 1, 24];
891 def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
892 SUB_FI16m, SUB_FI32m,
893 SUBR_FI16m, SUBR_FI32m,
894 MUL_FI16m, MUL_FI32m)>;
896 def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
897 let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
898 let ReleaseAtCycles = [1, 1, 62];
901 def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
902 DIVR_FI16m, DIVR_FI32m)>;
904 defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
905 defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
906 defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
907 defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
908 defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
909 defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
910 defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
911 defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare.
912 defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
913 defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
914 defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
915 defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare.
916 defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
917 defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
918 defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
919 defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
920 defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
921 defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication.
922 defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
923 defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
924 defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
925 defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication.
926 defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
927 defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
928 defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
929 defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>; // Floating point division.
930 defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
931 defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
932 defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
933 defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division.
934 defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
935 defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
936 defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
937 defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root.
938 defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
939 defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
940 defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>; // Floating point square root (ZMM).
941 defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root.
942 defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
943 defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
944 defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
945 defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
946 defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate.
947 defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
948 defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
949 defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
950 defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate.
951 defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
952 defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
953 defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
954 defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add.
955 defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
956 defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
957 defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
958 defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
959 defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
960 defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
961 defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
962 defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
963 defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
964 defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).
966 defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
967 defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
968 defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
969 defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
970 defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
971 defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
972 defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
973 defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
974 defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
975 defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
976 defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
977 defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
978 defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
979 defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
980 defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
981 defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
982 defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
983 defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).
985 // Horizontal Add/Sub (float and integer)
986 defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
987 defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
988 defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
989 defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
990 defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
991 defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
992 defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
994 // Vector integer operations.
995 defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
996 defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
997 defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
998 defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
999 defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
1000 defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
1001 defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
1002 defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1003 defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1005 def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
1007 let ReleaseAtCycles = [1];
1008 let NumMicroOps = 1;
1010 def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
1012 def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
1013 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1014 let ReleaseAtCycles = [1, 1, 1];
1015 let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
1017 def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1019 def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
1020 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1021 let ReleaseAtCycles = [1, 1, 1];
1022 let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1024 def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
1026 defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1027 defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1028 defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1029 defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1030 defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
1031 defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
1032 defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1034 defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
1035 defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;
1037 def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1039 let ReleaseAtCycles = [1, 2];
1040 let NumMicroOps = 2;
1042 def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1044 def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1046 let ReleaseAtCycles = [1, 4];
1047 let NumMicroOps = 2;
1049 def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1051 defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.
1053 def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1055 let ReleaseAtCycles = [1, 1];
1056 let NumMicroOps = 1;
1058 def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1060 def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1062 let ReleaseAtCycles = [1, 1];
1063 let NumMicroOps = 2;
1065 def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1067 defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
1069 def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1071 let ReleaseAtCycles = [2];
1072 let NumMicroOps = 1;
1074 def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1075 PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1077 PSIGNBrr, PSIGNDrr, PSIGNWrr,
1078 VPABSBrr, VPABSDrr, VPABSWrr,
1079 VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1082 VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1083 PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1085 def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
1087 let ReleaseAtCycles = [1];
1088 let NumMicroOps = 1;
1090 def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr,
1091 KANDBrr, KANDDrr, KANDQrr, KANDWrr,
1092 KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
1093 KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
1094 KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
1095 KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
1096 KORBrr, KORDrr, KORQrr, KORWrr,
1097 KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
1098 KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
1099 KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
1100 KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
1101 KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;
1103 def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
1105 let ReleaseAtCycles = [1];
1106 let NumMicroOps = 1;
1108 def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;
1110 def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
1112 let ReleaseAtCycles = [1];
1113 let NumMicroOps = 1;
1115 def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
1117 def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1118 // TODO: All align instructions are expected to be of 4 cycle latency
1120 let ReleaseAtCycles = [1];
1121 let NumMicroOps = 1;
1123 def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
1124 VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
1126 defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
1128 def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1130 let ReleaseAtCycles = [1];
1131 let NumMicroOps = 1;
1133 def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1134 VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1135 VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1136 VPAVGBYrr, VPAVGWYrr,
1138 VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1140 defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).
1142 defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
1143 defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1144 defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1145 defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
1146 defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1147 defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
1148 defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM).
1149 defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
1150 defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
1151 defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1152 defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
1153 defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
1154 defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1155 defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1156 defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
1157 defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
1158 defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1159 defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1160 defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
1161 defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
1162 defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1163 defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
1164 defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles.
1165 defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1166 defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1167 defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
1168 defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
1169 defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1170 defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1171 defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
1172 defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
1173 defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1174 defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
1175 defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
1176 defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1177 defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
1178 defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
1179 defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1180 defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1181 defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
1182 defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1183 defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1184 defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
1185 defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.
1187 // Vector insert/extract operations.
1188 defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1189 defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1190 defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1192 // MOVMSK operations.
1193 defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1194 defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1195 defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
1196 defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1198 // Conversion between integer and float.
1199 defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer.
1200 defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
1201 defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
1202 defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).
1204 def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1206 let ReleaseAtCycles = [2];
1207 let NumMicroOps = 2;
1209 defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer.
1211 defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1212 defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
1213 defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).
1215 defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
1216 defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1217 defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1218 defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM).
1220 def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1222 let ReleaseAtCycles = [6];
1223 let NumMicroOps = 2;
1226 defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
1227 defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1228 defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1229 defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).
1231 def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1233 let ReleaseAtCycles = [1];
1234 let NumMicroOps = 2;
1237 defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
1238 defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1239 defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1240 defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).
1242 defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
1243 defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1244 defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1245 defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).
1247 defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1248 defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1249 defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).
1251 defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1252 defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1253 defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).
1255 defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1256 defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1257 defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).
1259 // CRC32 instruction.
1260 defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;
1262 def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
1264 let ReleaseAtCycles = [2];
1265 let NumMicroOps = 2;
1267 def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
1269 def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1270 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
1271 let ReleaseAtCycles = [1, 1, 2];
1272 let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
1274 def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
1276 def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
1278 let ReleaseAtCycles = [2];
1279 let NumMicroOps = 1;
1281 def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
1283 def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1284 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
1285 let ReleaseAtCycles = [1, 1, 2];
1286 let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
1288 def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1290 def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
1292 let ReleaseAtCycles = [3];
1293 let NumMicroOps = 2;
1295 def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
1297 def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1298 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
1299 let ReleaseAtCycles = [1, 1, 3];
1300 let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
1302 def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1304 def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
1306 let ReleaseAtCycles = [8];
1307 let NumMicroOps = 4;
1309 def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
1311 def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1312 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
1313 let ReleaseAtCycles = [1, 1, 8];
1314 let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
1316 def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
1318 def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
1320 let ReleaseAtCycles = [8];
1321 let NumMicroOps = 1;
1323 def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
1325 def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
1327 let ReleaseAtCycles = [8];
1328 let NumMicroOps = 1;
1330 def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1332 // Strings instructions.
1333 // Packed Compare Implicit Length Strings, Return Mask
1334 defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1335 // Packed Compare Explicit Length Strings, Return Mask
1336 defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1337 // Packed Compare Implicit Length Strings, Return Index
1338 defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
1339 // Packed Compare Explicit Length Strings, Return Index
1340 defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1342 // AES instructions.
1343 defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
1344 defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
1345 defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
1347 // Carry-less multiplication instructions.
1348 defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
1351 defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1354 defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1355 defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1357 // Catch-all for expensive system instructions.
1358 defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
1360 def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
1361 let Latency = 0; // FIXME: not from llvm-exegesis
1362 let ReleaseAtCycles = [1];
1363 let NumMicroOps = 1;
1365 def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;
1367 def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
1368 let Latency = 10; // FIXME: not from llvm-exegesis
1369 let ReleaseAtCycles = [24];
1370 let NumMicroOps = 18;
1372 def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;
1375 defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1376 defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1377 defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.
1379 def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
1381 let ReleaseAtCycles = [1];
1382 let NumMicroOps = 1;
1384 def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
1386 def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1387 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
1388 let ReleaseAtCycles = [1, 1, 1];
1389 let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
1391 def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
1393 def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
1395 let ReleaseAtCycles = [1];
1396 let NumMicroOps = 2;
1398 def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
1400 def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1401 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
1402 let ReleaseAtCycles = [1, 1, 2];
1403 let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
1405 def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
1407 def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
1409 let ReleaseAtCycles = [1];
1410 let NumMicroOps = 2;
1412 def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
1414 def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1415 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
1416 let ReleaseAtCycles = [1, 1, 2];
1417 let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
1419 def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
1421 def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
1423 let ReleaseAtCycles = [1];
1424 let NumMicroOps = 2;
1426 def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
1428 def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1429 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
1430 let ReleaseAtCycles = [1, 1, 2];
1431 let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
1433 def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
1435 defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1436 defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
1437 defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
1438 defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1439 defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).
1441 // Old microcoded instructions that nobody use.
1442 defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;
1444 // Fence instructions.
1445 defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;
1447 def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
1449 let ReleaseAtCycles = [30];
1450 let NumMicroOps = 1;
1452 def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;
1454 def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
1456 let ReleaseAtCycles = [1];
1457 let NumMicroOps = 1;
1459 def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;
1461 // Nop, not very useful expect it provides a model for nops!
1462 defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1465 ///////////////////////////////////////////////////////////////////////////////
1467 ///////////////////////////////////////////////////////////////////////////////
1469 def Zn4WriteZeroLatency : SchedWriteRes<[]> {
1471 let ReleaseAtCycles = [];
1472 let NumMicroOps = 1;
1474 def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
1475 MOV64rr, MOV64rr_REV,
1478 def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
1480 let ReleaseAtCycles = [];
1481 let NumMicroOps = 2;
1483 def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
1484 XCHG64rr, XCHG64ar)>;
1486 defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
1488 defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
1489 defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
1490 defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>;
1492 defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
1493 defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
1494 defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
1495 defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>;
1497 def : IsOptimizableRegisterMove<[
1498 InstructionEquivalenceClass<[
1500 MOV32rr, MOV32rr_REV,
1501 MOV64rr, MOV64rr_REV,
1507 // MMX moves are *NOT* eliminated.
1510 MOVAPSrr, MOVAPSrr_REV,
1511 MOVUPSrr, MOVUPSrr_REV,
1512 MOVAPDrr, MOVAPDrr_REV,
1513 MOVUPDrr, MOVUPDrr_REV,
1514 MOVDQArr, MOVDQArr_REV,
1515 MOVDQUrr, MOVDQUrr_REV,
1518 VMOVAPSrr, VMOVAPSrr_REV,
1519 VMOVUPSrr, VMOVUPSrr_REV,
1520 VMOVAPDrr, VMOVAPDrr_REV,
1521 VMOVUPDrr, VMOVUPDrr_REV,
1522 VMOVDQArr, VMOVDQArr_REV,
1523 VMOVDQUrr, VMOVDQUrr_REV,
1525 // AVX YMM variants.
1526 VMOVAPSYrr, VMOVAPSYrr_REV,
1527 VMOVUPSYrr, VMOVUPSYrr_REV,
1528 VMOVAPDYrr, VMOVAPDYrr_REV,
1529 VMOVUPDYrr, VMOVUPDYrr_REV,
1530 VMOVDQAYrr, VMOVDQAYrr_REV,
1531 VMOVDQUYrr, VMOVDQUYrr_REV,
1535 // FIXUP and RANGE Instructions
1536 def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
1538 let ReleaseAtCycles = [2];
1539 let NumMicroOps = 1;
1541 def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
1542 "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
1543 "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
1544 "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
1547 // SCALE & REDUCE instructions
1548 def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
1550 let ReleaseAtCycles = [6];
1551 let NumMicroOps = 2;
1553 def : InstRW<[Zn4WriteSCALErr], (instregex
1554 "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
1555 "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
1558 //BF16PS Instructions
1559 def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
1561 let ReleaseAtCycles = [6];
1562 let NumMicroOps = 2;
1564 def : InstRW<[Zn4WriteBF16], (instregex
1565 "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
1568 // BUSD and VPMADD Instructions
1569 def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
1571 let ReleaseAtCycles = [4];
1572 let NumMicroOps = 1;
1574 def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
1575 "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
1576 "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
1579 // SHIFT instructions
1580 def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
1582 let ReleaseAtCycles = [2];
1583 let NumMicroOps = 1;
1585 def : InstRW<[Zn4WriteSHIFTrr], (instregex
1586 "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
1587 "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1588 "(V?)P(SLL|SRL|SRA)DQYri",
1589 "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
1590 "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
1591 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1592 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
1593 "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
1594 "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
1597 def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
1599 let ReleaseAtCycles = [1];
1600 let NumMicroOps = 1;
1602 def : InstRW<[Zn4WriteSHIFTri], (instregex
1603 "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
1606 // ALIGN Instructions
1607 def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
1609 let ReleaseAtCycles = [2];
1610 let NumMicroOps = 1;
1612 def : InstRW<[Zn4WriteALIGN], (instregex
1613 "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
1617 def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
1619 let ReleaseAtCycles = [2];
1620 let NumMicroOps = 1;
1622 def : InstRW<[Zn4WritePACK], (instregex
1623 "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
1626 // MAX and MIN Instructions
1627 def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
1629 let ReleaseAtCycles = [2];
1630 let NumMicroOps = 1;
1632 def : InstRW<[Zn4WriteFCmp64], (instregex
1633 "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
1634 "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
1635 "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
1636 "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
1640 def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
1642 let ReleaseAtCycles = [2];
1643 let NumMicroOps = 1;
1645 def : InstRW<[Zn4MOVS], (instregex
1646 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
1647 "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
1648 "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
1649 "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
1650 "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
1653 def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
1655 let ReleaseAtCycles = [4];
1656 let NumMicroOps = 1;
1658 def : InstRW<[Zn4MOVSZ], (instregex
1659 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
1662 def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
1664 let ReleaseAtCycles = [5];
1665 let NumMicroOps = 1;
1667 def : InstRW<[Zn4MOVSrr], (instregex
1668 "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
1672 //VPTEST Instructions
1673 def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1675 let ReleaseAtCycles = [3];
1676 let NumMicroOps = 1;
1678 def : InstRW<[Zn4VPTESTZ128], (instregex
1679 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
1682 def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1684 let ReleaseAtCycles = [4];
1685 let NumMicroOps = 1;
1687 def : InstRW<[Zn4VPTESTZ256], (instregex
1688 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
1691 def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
1693 let ReleaseAtCycles = [5];
1694 let NumMicroOps = 1;
1696 def : InstRW<[Zn4VPTESTZ], (instregex
1697 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
1700 // CONFLICT Instructions
1701 def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1703 let ReleaseAtCycles = [2];
1704 let NumMicroOps = 1;
1706 def : InstRW<[Zn4CONFLICTZ128], (instregex
1707 "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
1710 def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
1712 let ReleaseAtCycles = [2,2,2];
1713 let NumMicroOps = 4;
1715 def : InstRW<[Zn4CONFLICTrr], (instregex
1716 "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
1719 // RSQRT Instructions
1720 def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1722 let ReleaseAtCycles = [2];
1723 let NumMicroOps = 1;
1725 def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
1726 "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
1730 // PERM Instructions
1731 def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
1733 let ReleaseAtCycles = [2];
1734 let NumMicroOps = 1;
1736 def : InstRW<[Zn4PERMILP], (instregex
1737 "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
1740 def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
1742 let ReleaseAtCycles = [2];
1743 let NumMicroOps = 1;
1745 def : InstRW<[Zn4PERMIT2_128], (instregex
1746 "VPERM(I2|T2)(PS|PD|W)Z128(rr|rrk|rrkz)",
1747 "VPERM(I2|T2)(B|D|Q)Z128(rr|rrk|rrkz)"
1750 def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
1752 let ReleaseAtCycles = [2];
1753 let NumMicroOps = 1;
1755 def : InstRW<[Zn4PERMIT2_128rr], (instregex
1756 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
1757 "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
1760 def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
1762 let ReleaseAtCycles = [2];
1763 let NumMicroOps = 1;
1765 def : InstRW<[Zn4PERMIT2_256], (instregex
1766 "VPERM(I2|T2)(PS|PD|W)Z256(rr|rrk|rrkz)",
1767 "VPERMP(S|D)Z256(rr|rrk|rrkz)",
1768 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
1769 "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
1770 "VPERM(I2|Q|T2)(B|D|Q)Z256(rr|rrk|rrkz)",
1771 "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
1774 def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
1776 let ReleaseAtCycles = [2];
1777 let NumMicroOps = 1;
1779 def : InstRW<[Zn4PERMIT2Z], (instregex
1780 "VPERM(I2|T2)(PS|PD|W)Z(rr|rrk|rrkz)",
1781 "VPERM(B|D|W)Z(rr|rrk|rrkz)",
1782 "VPERM(I2|Q|T2)(B|D|Q)Z(rr|rrk|rrkz)",
1783 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
1784 "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
1785 "VPERMP(S|D)Z(rr|rrk|rrkz)"
1788 // ALU SLOW Misc Instructions
1789 def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
1791 let ReleaseAtCycles = [2];
1792 let NumMicroOps = 1;
1794 def : InstRW<[Zn4VecALUZSlow], (instrs
1795 VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr,
1796 VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk,
1797 VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz,
1798 VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr,
1799 VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk,
1800 VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
1801 VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr,
1802 VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk,
1803 VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
1804 VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
1805 VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk,
1806 VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz,
1807 VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
1808 VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
1812 ///////////////////////////////////////////////////////////////////////////////
1813 // Dependency breaking instructions.
1814 ///////////////////////////////////////////////////////////////////////////////
1816 def Zn4WriteZeroIdiom : SchedWriteVariant<[
1817 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1818 SchedVar<NoSchedPred, [WriteALU]>
1820 def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
1821 XOR64rr, XOR64rr_REV,
1822 SUB32rr, SUB32rr_REV,
1823 SUB64rr, SUB64rr_REV)>;
1825 def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
1826 SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
1827 SchedVar<NoSchedPred, [WriteALU]>
1829 def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
1830 CMP16rr, CMP16rr_REV,
1831 CMP32rr, CMP32rr_REV,
1832 CMP64rr, CMP64rr_REV)>;
1834 def Zn4WriteFZeroIdiom : SchedWriteVariant<[
1835 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1836 SchedVar<NoSchedPred, [WriteFLogic]>
1838 // NOTE: XORPSrr, XORPDrr are not zero-cycle!
1839 def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
1840 VANDNPSrr, VANDNPDrr)>;
1842 def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
1843 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1844 SchedVar<NoSchedPred, [WriteFLogicY]>
1846 def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
1847 VANDNPSYrr, VANDNPDYrr)>;
1849 def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
1850 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1851 SchedVar<NoSchedPred, [WriteVecLogicX]>
1853 // NOTE: PXORrr,PANDNrr are not zero-cycle!
1854 def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
1856 def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
1857 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1858 SchedVar<NoSchedPred, [WriteVecLogicY]>
1860 def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
1862 def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
1863 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1864 SchedVar<NoSchedPred, [WriteVecALUX]>
1866 // NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1867 // PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1868 def : InstRW<[Zn4WriteVZeroIdiomALUX],
1869 (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1870 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
1872 def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
1873 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1874 SchedVar<NoSchedPred, [WriteVecALUY]>
1876 def : InstRW<[Zn4WriteVZeroIdiomALUY],
1877 (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1878 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1880 def : IsZeroIdiomFunction<[
1882 DepBreakingClass<[ XOR32rr, XOR32rr_REV,
1883 XOR64rr, XOR64rr_REV,
1884 SUB32rr, SUB32rr_REV,
1885 SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
1887 // SSE XMM Zero-idioms.
1896 PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1898 PSUBUSBrr, PSUBUSWrr,
1899 PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
1900 ], ZeroIdiomPredicate>,
1902 // AVX XMM Zero-idioms.
1906 VANDNPSrr, VANDNPDrr,
1911 VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1912 VPSUBSBrr, VPSUBSWrr,
1913 VPSUBUSBrr, VPSUBUSWrr,
1914 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1915 ], ZeroIdiomPredicate>,
1917 // AVX YMM Zero-idioms.
1920 VXORPSYrr, VXORPDYrr,
1921 VANDNPSYrr, VANDNPDYrr,
1926 VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1927 VPSUBSBYrr, VPSUBSWYrr,
1928 VPSUBUSBYrr, VPSUBUSWYrr,
1929 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
1930 ], ZeroIdiomPredicate>,
1933 def : IsDepBreakingFunction<[
1935 DepBreakingClass<[ SBB32rr, SBB32rr_REV,
1936 SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
1937 DepBreakingClass<[ CMP8rr, CMP8rr_REV,
1938 CMP16rr, CMP16rr_REV,
1939 CMP32rr, CMP32rr_REV,
1940 CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
1943 PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1944 ], ZeroIdiomPredicate>,
1948 VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1949 ], ZeroIdiomPredicate>,
1953 VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
1954 ], ZeroIdiomPredicate>,