1 //=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines the machine model for Znver4 to support instruction
10 // scheduling and other instruction cost heuristics.
12 // * AMD Software Optimization Guide for AMD Family 19h Processors.
13 // https://www.amd.com/system/files/TechDocs/56665.zip
14 //===----------------------------------------------------------------------===//
16 def Znver4Model : SchedMachineModel {
17 // AMD SOG 19h, 2.9.6 Dispatch
18 // The processor may dispatch up to 6 macro ops per cycle
19 // into the execution engine.
// NOTE(review): the `let IssueWidth = 6;` this comment describes is not
// visible in this copy of the file -- confirm it was not lost.
21 // AMD SOG 19h, 2.10.3
22 // The retire control unit (RCU) tracks the completion status of all
23 // outstanding operations (integer, load/store, and floating-point) and is
24 // the final arbiter for exception processing and recovery.
25 // The unit can receive up to 6 macro ops dispatched per cycle and track up
26 // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
27 let MicroOpBufferSize = 320;
28 // AMD SOG 19h, 2.9.1 Op Cache
29 // The op cache is organized as an associative cache with 64 sets and 8 ways.
30 // At each set-way intersection is an entry containing up to 8 macro ops.
31 // The maximum capacity of the op cache is 6.75K ops.
32 // Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
33 // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
34 // unrolling leading to excessive filling of the op-cache from frontend.
35 let LoopMicroOpBufferSize = 108;
36 // AMD SOG 19h, 2.6.2 L1 Data Cache
37 // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
38 // AMD SOG 19h, 2.12 L1 Data Cache
39 // The AGU and LS pipelines are optimized for simple address generation modes.
40 // <...> and can achieve 4-cycle load-to-use integer load latency.
// NOTE(review): the `let LoadLatency = 4;` this comment describes is not
// visible in this copy of the file, although Zn4WriteRes* helpers below
// reference Znver4Model.LoadLatency -- confirm it was not lost.
42 // AMD SOG 19h, 2.12 L1 Data Cache
43 // The AGU and LS pipelines are optimized for simple address generation modes.
44 // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
// Vector/FP load-to-use is 3 cycles more than the integer load-to-use above.
45 int VecLoadLatency = 7;
46 // Latency of a simple store operation.
// NOTE(review): the `int StoreLatency = ...;` this comment describes is not
// visible here, although Zn4WriteResInt users below reference
// Znver4Model.StoreLatency -- confirm it was not lost.
49 let HighLatency = 25; // FIXME: any better choice?
50 // AMD SOG 19h, 2.8 Optimizing Branching
51 // The branch misprediction penalty is in the range from 11 to 18 cycles,
52 // <...>. The common case penalty is 13 cycles.
53 let MispredictPenalty = 13;
55 let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
// Every instruction is expected to be matched by this model.
57 let CompleteModel = 1;
// NOTE(review): the closing `}` of this def is not visible in this copy.
60 let SchedModel = Znver4Model in {
63 //===----------------------------------------------------------------------===//
65 //===----------------------------------------------------------------------===//
67 // AMD SOG 19h, 2.10.3 Retire Control Unit
68 // The unit can receive up to 6 macro ops dispatched per cycle and track up to
69 // 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
70 // The retire unit handles in-order commit of up to nine macro ops per cycle.
// Retire width (9/cy) intentionally exceeds the 6/cy dispatch width.
71 def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
73 //===----------------------------------------------------------------------===//
74 // Integer Execution Unit
77 // AMD SOG 19h, 2.4 Superscalar Organization
78 // The processor uses four decoupled independent integer scheduler queues,
79 // each one servicing one ALU pipeline and one or two other pipelines
83 //===----------------------------------------------------------------------===//
85 // AMD SOG 19h, 2.10.2 Execution Units
86 // The processor contains 4 general purpose integer execution pipes.
87 // Each pipe has an ALU capable of general purpose integer operations.
// Specialized capabilities (divide, branch, multiply) are overlaid on these
// same pipes via the defvar aliases further below.
88 def Zn4ALU0 : ProcResource<1>;
89 def Zn4ALU1 : ProcResource<1>;
90 def Zn4ALU2 : ProcResource<1>;
91 def Zn4ALU3 : ProcResource<1>;
93 // AMD SOG 19h, 2.10.2 Execution Units
94 // There is also a separate branch execution unit.
// The second branch port lives on ALU0 and is modeled as defvar Zn4BRU0 below.
95 def Zn4BRU1 : ProcResource<1>;
97 // AMD SOG 19h, 2.10.2 Execution Units
98 // There are three Address Generation Units (AGUs) for all load and store
99 // address generation. There are also 3 store data movement units
100 // associated with the same schedulers as the AGUs.
101 def Zn4AGU0 : ProcResource<1>;
102 def Zn4AGU1 : ProcResource<1>;
103 def Zn4AGU2 : ProcResource<1>;
107 //===----------------------------------------------------------------------===//
// Aliases overlaying specialized capabilities onto the physical ALU pipes.
109 // AMD SOG 19h, 2.10.2 Execution Units
110 // ALU0 additionally has divide <...> execution capability.
111 defvar Zn4Divider = Zn4ALU0;
113 // AMD SOG 19h, 2.10.2 Execution Units
114 // ALU0 additionally has <...> branch execution capability.
115 defvar Zn4BRU0 = Zn4ALU0;
117 // Integer Multiplication issued on ALU1.
118 defvar Zn4Multiplier = Zn4ALU1;
120 // Execution pipeline grouping
121 //===----------------------------------------------------------------------===//
123 // General ALU operations
124 def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;
126 // General AGU operations
127 def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;
129 // Control flow: jumps, calls
// Zn4BRU0 aliases Zn4ALU0, so branches contend with ALU0's other duties.
130 def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;
132 // Everything that isn't control flow, but still needs to access CC register,
133 // namely: conditional moves, SETcc.
134 def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;
136 // Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
138 // Simple bit twiddling: bit test, shift/rotate, bit extraction
139 def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
144 //===----------------------------------------------------------------------===//
146 // AMD SOG 19h, 2.10.3 Retire Control Unit
147 // The integer physical register file (PRF) consists of 224 registers.
// Cost 1 per GR64/CCR register; move elimination is enabled for GR64 only
// (the [1, 0] flag list).
148 def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
149 6, // Max moves that can be eliminated per cycle.
150 0>; // Restrict move elimination to zero regs.
152 // anandtech, The integer scheduler has a 4*24 entry macro op capacity.
153 // AMD SOG 19h, 2.10.1 Schedulers
154 // The schedulers can receive up to six macro ops per cycle, with a limit of
155 // two per scheduler. Each scheduler can issue one micro op per cycle into
156 // each of its associated pipelines
// FIXME: this models the four independent schedulers as one pooled group.
157 def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
158 Zn4ALU1, Zn4AGU1, // scheduler 1
159 Zn4ALU2, Zn4AGU2, // scheduler 2
160 Zn4ALU3, Zn4BRU1 // scheduler 3
// NOTE(review): the `]> {` line closing the resource list appears to be
// missing in this copy of the file.
162 let BufferSize = !mul(4, 24);
166 //===----------------------------------------------------------------------===//
167 // Floating-Point Unit
170 // AMD SOG 19h, 2.4 Superscalar Organization
171 // The processor uses <...> two decoupled independent floating point schedulers
172 // each servicing two FP pipelines and one store or FP-to-integer pipeline.
176 //===----------------------------------------------------------------------===//
178 // AMD SOG 19h, 2.10.1 Schedulers
179 // <...>, and six FPU pipes.
180 // Agner, 22.10 Floating point execution pipes
181 // There are six floating point/vector execution pipes,
// Pipes 4 and 5 are modeled as one 2-wide resource (Zn4FP45).
182 def Zn4FP0 : ProcResource<1>;
183 def Zn4FP1 : ProcResource<1>;
184 def Zn4FP2 : ProcResource<1>;
185 def Zn4FP3 : ProcResource<1>;
186 def Zn4FP45 : ProcResource<2>;
190 //===----------------------------------------------------------------------===//
191 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// Each defvar below names the capability a physical FP pipe provides;
// several capabilities share the same pipe.
193 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
194 defvar Zn4FPFMul0 = Zn4FP0;
195 defvar Zn4FPFMul1 = Zn4FP1;
// Floating point adds ((v)FADD*).
198 defvar Zn4FPFAdd0 = Zn4FP2;
199 defvar Zn4FPFAdd1 = Zn4FP3;
201 // All convert operations except pack/unpack
202 defvar Zn4FPFCvt0 = Zn4FP2;
203 defvar Zn4FPFCvt1 = Zn4FP3;
205 // All Divide and Square Root except Reciprocal Approximation
206 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
207 // FDIV unit can support 2 simultaneous operations in flight
208 // even though it occupies a single pipe.
209 // FIXME: BufferSize=2 ?
210 defvar Zn4FPFDiv = Zn4FP1;
212 // Moves and Logical operations on Floating Point Data Types
213 defvar Zn4FPFMisc0 = Zn4FP0;
214 defvar Zn4FPFMisc1 = Zn4FP1;
215 defvar Zn4FPFMisc2 = Zn4FP2;
216 defvar Zn4FPFMisc3 = Zn4FP3;
218 // Integer Adds, Subtracts, and Compares
219 // Some complex VADD operations are not available in all pipes.
220 defvar Zn4FPVAdd0 = Zn4FP0;
221 defvar Zn4FPVAdd1 = Zn4FP1;
222 defvar Zn4FPVAdd2 = Zn4FP2;
223 defvar Zn4FPVAdd3 = Zn4FP3;
225 // Integer Multiplies, SAD, Blendvb
226 defvar Zn4FPVMul0 = Zn4FP0;
227 defvar Zn4FPVMul1 = Zn4FP3;
229 // Data Shuffles, Packs, Unpacks, Permute
230 // Some complex shuffle operations are only available in pipe1.
231 defvar Zn4FPVShuf = Zn4FP1;
232 defvar Zn4FPVShufAux = Zn4FP2;
234 // Bit Shift Left/Right operations
235 defvar Zn4FPVShift0 = Zn4FP1;
236 defvar Zn4FPVShift1 = Zn4FP2;
238 // Moves and Logical operations on Packed Integer Data Types
239 defvar Zn4FPVMisc0 = Zn4FP0;
240 defvar Zn4FPVMisc1 = Zn4FP1;
241 defvar Zn4FPVMisc2 = Zn4FP2;
242 defvar Zn4FPVMisc3 = Zn4FP3;
// AES operations
245 defvar Zn4FPAES0 = Zn4FP0;
246 defvar Zn4FPAES1 = Zn4FP1;
// Carry-less multiplication (CLMUL) operations
249 defvar Zn4FPCLM0 = Zn4FP0;
250 defvar Zn4FPCLM1 = Zn4FP1;
252 // Execution pipeline grouping
253 //===----------------------------------------------------------------------===//
255 // AMD SOG 19h, 2.11 Floating-Point Unit
256 // Stores and floating point to general purpose register transfer
257 // have 2 dedicated pipelines (pipe 5 and 6).
258 def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
260 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
261 def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;
// Floating point adds.
264 // Some complex VADD operations are not available in all pipes.
265 def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;
267 // All convert operations except pack/unpack
268 def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;
270 // All Divide and Square Root except Reciprocal Approximation
// Zn4FPFDiv is a single pipe (defvar above), so no group is needed.
271 // def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;
273 // Moves and Logical operations on Floating Point Data Types
274 def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
276 // FIXUP and RANGE use FP01 pipelines
277 def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
278 def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
279 // SCALE instructions use FP23 pipelines
280 def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
281 def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;
283 // Loads, Stores and Move to General Register (EX) Operations
284 // AMD SOG 19h, 2.11 Floating-Point Unit
285 // Stores and floating point to general purpose register transfer
286 // have 2 dedicated pipelines (pipe 5 and 6).
287 defvar Zn4FPLd01 = Zn4FP45;
289 // AMD SOG 19h, 2.11 Floating-Point Unit
290 // Note that FP stores are supported on two pipelines,
291 // but throughput is limited to one per cycle.
// Model the 1/cy store limit as a single sub-unit of the 2-wide Zn4FP45.
292 let Super = Zn4FP45 in
293 def Zn4FPSt : ProcResource<1>;
295 // Integer Adds, Subtracts, and Compares
296 // Some complex VADD operations are not available in all pipes.
297 def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;
299 def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
300 def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;
302 // AVX512 Opmask pipelines
303 def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>;
304 def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>;
306 // Integer Multiplies, SAD, Blendvb
307 def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;
309 // Data Shuffles, Packs, Unpacks, Permute
310 // Some complex shuffle operations are only available in pipe1.
311 def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;
313 // Bit Shift Left/Right operations
314 def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;
316 // Moves and Logical operations on Packed Integer Data Types
317 def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;
// AES operations
320 def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;
// Carry-less multiplication (CLMUL) operations
323 def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;
328 //===----------------------------------------------------------------------===//
330 // Agner, 21.8 Register renaming and out-of-order schedulers
331 // The floating point register file has 192 vector registers
332 // of 512b each in zen4.
// NOTE(review): four register classes but only three move-elimination flags
// ([0, 1, 1]); confirm whether a flag for VR512 was intended here.
333 def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
334 6, // Max moves that can be eliminated per cycle.
335 0>; // Restrict move elimination to zero regs.
337 // AMD SOG 19h, 2.11 Floating-Point Unit
338 // The floating-point scheduler has a 2*32 entry macro op capacity.
339 // AMD SOG 19h, 2.11 Floating-Point Unit
340 // <...> the scheduler can issue 1 micro op per cycle for each pipe.
341 // FIXME: those are two separate schedulers, not a single big one.
342 def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
343 Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1
// NOTE(review): the `]> {` line closing the resource list appears to be
// missing in this copy of the file.
345 let BufferSize = !mul(2, 32);
348 // AMD SOG 19h, 2.11 Floating-Point Unit
349 // Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
350 // even if floating-point scheduler is full.
351 // FIXME: how to model this properly?
354 //===----------------------------------------------------------------------===//
// Load-Store Unit
358 // AMD SOG 19h, 2.12 Load-Store Unit
359 // The LS unit contains three largely independent pipe-lines
360 // enabling the execution of three 256-bit memory operations per cycle.
361 def Zn4LSU : ProcResource<3>;
363 // AMD SOG 19h, 2.12 Load-Store Unit
364 // All three memory operations can be loads.
365 let Super = Zn4LSU in
366 def Zn4Load : ProcResource<3> {
367 // AMD SOG 19h, 2.12 Load-Store Unit
368 // The LS unit can process up to 72 out-of-order loads.
// NOTE(review): the `let BufferSize = 72;` and closing `}` this comment
// describes are not visible in this copy of the file.
372 def Zn4LoadQueue : LoadQueue<Zn4Load>;
374 // AMD SOG 19h, 2.12 Load-Store Unit
375 // A maximum of two of the memory operations can be stores.
376 let Super = Zn4LSU in
377 def Zn4Store : ProcResource<2> {
378 // AMD SOG 19h, 2.12 Load-Store Unit
379 // The LS unit utilizes a 64-entry store queue (STQ).
// NOTE(review): the `let BufferSize = 64;` and closing `}` this comment
// describes are not visible in this copy of the file.
383 def Zn4StoreQueue : StoreQueue<Zn4Store>;
385 //===----------------------------------------------------------------------===//
386 // Basic helper classes.
387 //===----------------------------------------------------------------------===//
389 // Many SchedWrites are defined in pairs with and without a folded load.
390 // Instructions with folded loads are usually micro-fused, so they only appear
391 // as two micro-ops when dispatched by the schedulers.
392 // This multiclass defines the resource usage for variants with and without
// folded loads.
// Base helper: one WriteRes with the given latency, per-port resource
// cycles, and micro-op count.
395 multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
396 int Lat = 1, list<int> Res = [], int UOps = 1> {
397 def : WriteRes<SchedRW, ExePorts> {
// NOTE(review): the `let Latency = Lat;` line appears to be missing here.
399 let ReleaseAtCycles = Res;
400 let NumMicroOps = UOps;
// NOTE(review): closing braces for the WriteRes body and the multiclass are
// not visible in this copy of the file.
404 multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
405 list<ProcResourceKind> ExePorts, int Lat,
406 list<int> Res, int UOps, int LoadLat, int LoadUOps,
407 ProcResourceKind AGU, int LoadRes> {
// Register-only variant.
408 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
// Memory-folded variant: prepend the AGU and the load pipe to the ports
// and account for the extra load latency/uops.
410 defm : __Zn4WriteRes<SchedRW.Folded,
411 !listconcat([AGU, Zn4Load], ExePorts),
// NOTE(review): parts of the folded latency/resource-cycle computation
// appear to be missing from this copy of the file.
413 !if(!and(!empty(Res), !eq(LoadRes, 1)),
415 !listconcat([1, LoadRes],
417 !listsplat(1, !size(ExePorts)),
419 !add(UOps, LoadUOps)>;
422 // For classes without folded loads.
423 multiclass Zn4WriteResInt<SchedWrite SchedRW,
424 list<ProcResourceKind> ExePorts, int Lat = 1,
425 list<int> Res = [], int UOps = 1> {
426 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
429 multiclass Zn4WriteResXMM<SchedWrite SchedRW,
430 list<ProcResourceKind> ExePorts, int Lat = 1,
431 list<int> Res = [], int UOps = 1> {
432 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
435 multiclass Zn4WriteResYMM<SchedWrite SchedRW,
436 list<ProcResourceKind> ExePorts, int Lat = 1,
437 list<int> Res = [], int UOps = 1> {
438 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
441 multiclass Zn4WriteResZMM<SchedWrite SchedRW,
442 list<ProcResourceKind> ExePorts, int Lat = 1,
443 list<int> Res = [], int UOps = 1> {
444 defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
447 // For classes with folded loads.
// Integer ops: folded loads go through the integer AGUs with the scalar
// load latency.
448 multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
449 list<ProcResourceKind> ExePorts, int Lat = 1,
450 list<int> Res = [], int UOps = 1,
451 int LoadUOps = 0, int LoadRes = 1> {
452 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
453 Znver4Model.LoadLatency,
454 LoadUOps, Zn4AGU012, LoadRes>;
// Vector ops: folded loads go through the FP load pipes (Zn4FPLd01) with
// the longer vector load latency.
457 multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
458 list<ProcResourceKind> ExePorts, int Lat = 1,
459 list<int> Res = [], int UOps = 1,
460 int LoadUOps = 0, int LoadRes = 1> {
461 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
462 Znver4Model.VecLoadLatency,
463 LoadUOps, Zn4FPLd01, LoadRes>;
466 multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
467 list<ProcResourceKind> ExePorts, int Lat = 1,
468 list<int> Res = [], int UOps = 1,
469 int LoadUOps = 0, int LoadRes = 1> {
470 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
471 Znver4Model.VecLoadLatency,
472 LoadUOps, Zn4FPLd01, LoadRes>;
// Note: ZMM ops default to UOps = 2, unlike the other pair helpers.
475 multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
476 list<ProcResourceKind> ExePorts, int Lat = 1,
477 list<int> Res = [], int UOps = 2,
478 int LoadUOps = 0, int LoadRes = 1> {
479 defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
480 Znver4Model.VecLoadLatency,
481 LoadUOps, Zn4FPLd01, LoadRes>;
484 //===----------------------------------------------------------------------===//
486 //===----------------------------------------------------------------------===//
// Read-after-load advances: a dependent read sees the loaded value early by
// the corresponding load latency.
488 def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;
490 def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
491 def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
492 def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
494 // AMD SOG 19h, 2.11 Floating-Point Unit
495 // There is 1 cycle of added latency for a result to cross
496 // from F to I or I to F domain.
// The negative advance charges one extra cycle for the domain crossing.
497 def : ReadAdvance<ReadInt2Fpu, -1>;
499 // Instructions with both a load and a store folded are modeled as a folded
// load plus this separate store write (AGU + store pipe).
501 defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;
503 // Loads, stores, and moves, not folded with other operations.
504 defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;
506 // Model the effect of clobbering the read-write mask operand of the GATHER operation.
507 // Does not cost anything by itself, only has latency, matching that of the WriteLoad,
508 defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;
// Narrow (8/16-bit) loads and load-extends occupy the AGUs for 3 cycles.
510 def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
511 let Latency = !add(Znver4Model.LoadLatency, 1);
512 let ReleaseAtCycles = [3, 1];
// NOTE(review): NumMicroOps / closing `}` are not visible in this copy.
515 def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
517 defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
518 defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
519 defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;
521 // Treat misc copies as a move.
522 def : InstRW<[WriteMove], (instrs COPY)>;
// MOVBE load form: byte swap needs an ALU op on top of the load.
524 def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
525 let Latency = Znver4Model.LoadLatency;
526 let ReleaseAtCycles = [1, 1, 4];
// NOTE(review): NumMicroOps / closing `}` are not visible in this copy.
529 def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;
// MOVBE store form: byte swap on the ALU, then the store.
531 def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
532 let Latency = Znver4Model.StoreLatency;
533 let ReleaseAtCycles = [4, 1, 1];
// NOTE(review): NumMicroOps / closing `}` are not visible in this copy.
536 def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
// NOTE(review): several SchedWriteRes bodies in this section are missing
// their `let Latency = ...;` / closing `}` lines in this copy of the file.
539 defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.
// Accumulator-immediate ALU forms occupy the ALUs longer than WriteALU.
541 def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
543 let ReleaseAtCycles = [4];
546 def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
547 AND8i8, AND16i16, AND32i32, AND64i32,
548 OR8i8, OR16i16, OR32i32, OR64i32,
549 SUB8i8, SUB16i16, SUB32i32, SUB64i32,
550 XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
// 16-bit sign/zero extends are slower than a plain move.
552 def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
554 let ReleaseAtCycles = [4];
557 def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
// Materializing a 32-bit (or sign-extended 64-bit) immediate.
559 def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
561 let ReleaseAtCycles = [2];
564 def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
// PDEP/PEXT execute only on ALU1 (see the Zn4ALU1 comment above).
566 def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
568 let ReleaseAtCycles = [1];
571 def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
572 PEXT32rr, PEXT64rr)>;
574 defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.
// 8-bit ADC/SBB with a memory destination: load + ALU + store.
576 def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
578 let ReleaseAtCycles = [1, 1, 7, 1];
581 def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
583 // This is for simple LEAs with one or two input operands.
584 defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.
586 // This write is used for slow LEA instructions.
587 def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
589 let ReleaseAtCycles = [1];
593 // On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
594 // or an LEA with a `Scale` value different than 1.
595 def Zn4SlowLEAPredicate : MCSchedPredicate<
597 // A 3-operand LEA (base, index, offset).
598 IsThreeOperandsLEAFn,
599 // An LEA with a "Scale" different than 1.
601 CheckIsImmOperand<2>,
602 CheckNot<CheckImmOperand<2, 1>>
// NOTE(review): the predicate combinators wrapping these checks (and their
// closing brackets) are not visible in this copy of the file.
607 def Zn4WriteLEA : SchedWriteVariant<[
608 SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
609 SchedVar<NoSchedPred, [WriteLEA]>
612 def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
// 16-bit LEA is always slow.
614 def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
615 let Latency = 2; // FIXME: not from llvm-exegesis
616 let ReleaseAtCycles = [4];
620 def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;
622 // Integer multiplication
// All multiplies issue on the dedicated multiplier pipe (alias of ALU1).
623 defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
624 defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
625 defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
626 defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
627 defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
628 defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
629 defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
630 defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
631 defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
632 defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
633 defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
634 defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// The high-part writes consume no ports or uops of their own, only latency.
635 defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part.
636 defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
638 defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
639 defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
// NOTE(review): several SchedWriteRes bodies in this section are missing
// their `let Latency = ...;` / closing `}` lines in this copy of the file.
641 defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
// 8-bit register CMPXCHG is modeled separately from the generic class.
643 def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
645 let ReleaseAtCycles = [12];
648 def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
650 defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap.
// Memory form derives its latency/uops from the register form above.
652 def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
653 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
654 let ReleaseAtCycles = [1, 1, 12];
655 let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
657 def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
// CMPXCHG8B/16B are microcoded: many uops, heavy ALU occupancy.
659 def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
660 let Latency = 3; // FIXME: not from llvm-exegesis
661 let ReleaseAtCycles = [24];
662 let NumMicroOps = 19;
664 def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
666 def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
667 let Latency = 4; // FIXME: not from llvm-exegesis
668 let ReleaseAtCycles = [59];
669 let NumMicroOps = 28;
671 def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
// These XCHG forms cannot be handled by register renaming alone.
673 def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
675 let ReleaseAtCycles = [2];
678 def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
680 def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
681 let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
682 let ReleaseAtCycles = [1, 1, 2];
685 def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
687 def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
688 let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
689 let ReleaseAtCycles = [1, 1, 2];
692 def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
// Integer division: all forms issue on the divider pipe (alias of ALU0).
695 // FIXME: uops for 8-bit division measures as 2. for others it's a guess.
696 // FIXME: latency for 8-bit division measures as 10. for others it's a guess.
697 defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
698 defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
699 defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
700 defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
701 defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
702 defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
703 defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
704 defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
706 defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
707 defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
709 defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
// The 16-bit forms below occupy the ALUs longer than their generic classes.
// NOTE(review): the `let Latency`/closing `}` lines of these defs are not
// visible in this copy of the file.
711 def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
713 let ReleaseAtCycles = [4];
716 def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;
718 defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.
720 def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
722 let ReleaseAtCycles = [4];
725 def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
727 defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
729 def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
731 let ReleaseAtCycles = [4];
734 def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
736 defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
737 defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
738 defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
739 defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
740 defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.
742 defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
743 defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
744 defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;
746 defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
747 defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
748 defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
750 // Integer shifts and rotates.
751 defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
752 defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
753 defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
755 def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
757 let ReleaseAtCycles = [2];
760 def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
761 RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
763 def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
764 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
765 let ReleaseAtCycles = [1, 1, 2];
766 let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
768 def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
769 RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
771 def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
773 let ReleaseAtCycles = [6];
// Rotate-through-carry (RCR/RCL) by immediate and by CL.
// Each memory (mi/mCL) form derives its latency and uop count from the
// corresponding register form, adding the model's integer load latency,
// and occupies AGU + load pipe in addition to the ALU.
776 def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
778 def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
779   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
780   let ReleaseAtCycles = [1, 1, 8];
// NOTE(review): this mem form adds 3 uops over the register form, while the
// RCL/rCL mem forms below all add 2 — confirm the asymmetry is intentional.
781   let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
783 def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
785 def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
787   let ReleaseAtCycles = [8];
790 def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
792 def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
793   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
794   let ReleaseAtCycles = [1, 1, 8];
795   let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
797 def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
// Rotate by CL register count.
799 defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
801 def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
803   let ReleaseAtCycles = [6];
806 def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
808 def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
809   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
810   let ReleaseAtCycles = [1, 1, 8];
811   let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
813 def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
815 def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
817   let ReleaseAtCycles = [8];
820 def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
822 def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
823   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
824   let ReleaseAtCycles = [1, 1, 8];
825   let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
827 def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
829 // Double shift instructions.
// SHLD/SHRD: register forms are 2-cycle ops on the Zn4ALU12 pipes; the memory
// forms fold the integer load latency into the total.
830 defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
831 defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
832 defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
833 defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
835 // BMI1 BEXTR/BLS, BMI2 BZHI
836 defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
837 defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
838 defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
840 // Idioms that clear a register, like xorps %xmm0, %xmm0.
841 // These can often bypass execution ports completely.
// Zero latency and zero resource cycles model the rename-stage elimination.
842 defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;
844 // Branches don't produce values, so they have no latency, but they still
845 // consume resources. Indirect branches can fold loads.
846 defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
848 // Floating point. This covers both scalar and vector operations.
// FP constant loads (FLD0/FLD1/FLDC) go through the FP load pipe plus Zn4FP1.
849 defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
850 defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
851 defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
// Plain and masked FP vector loads: vector load latency + 1 cycle.
852 defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
853 defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
854 defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
855 defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
856 defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
857 defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
// High-half moves to memory (MOVHPD/MOVHPS) get a dedicated 2-cycle write.
859 def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
860   let Latency = 2; // FIXME: not from llvm-exegesis
861   let ReleaseAtCycles = [1, 1];
864 def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
865                                                VMOVHPDmr, VMOVHPSmr)>;
867 defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
868 defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
869 defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
870 defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
871 defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
// Masked FP stores are far heavier than plain stores (up to 42 uops and
// 12 cycles of FP-store pipe occupancy for the 32-bit YMM form).
873 defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
874 defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
875 defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
876 defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
878 defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub.
// X87 arithmetic with integer-memory operands (FIADD etc.): the load is
// modeled explicitly; the heavy ReleaseAtCycles on Zn4FPU0123 reflects the
// long occupancy of the shared X87 resource.
880 def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
881   let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
882   let ReleaseAtCycles = [1, 1, 24];
885 def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
886                                          SUB_FI16m, SUB_FI32m,
887                                          SUBR_FI16m, SUBR_FI32m,
888                                          MUL_FI16m, MUL_FI32m)>;
// X87 divide with integer-memory operand: same shape, longer pipe occupancy.
890 def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
891   let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
892   let ReleaseAtCycles = [1, 1, 62];
895 def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
896                                        DIVR_FI16m, DIVR_FI32m)>;
// FP add/sub, compare, multiply, divide, sqrt, reciprocal estimates, FMA,
// dot products, rounding, logicals, tests, shuffles, blends, horizontal ops.
// Pattern throughout: ZMM forms double the ReleaseAtCycles of the YMM forms
// (512-bit ops are issued as two 256-bit halves on the same pipe set).
898 defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
899 defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
900 defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
901 defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
902 defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
903 defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
904 defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
// NOTE(review): scalar WriteFCmp uses [2] release cycles vs [1] for the XMM
// form, and WriteFCmp64 is latency 1 vs 2 elsewhere — presumably from
// llvm-exegesis measurements; confirm before changing.
905 defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare.
906 defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
907 defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
908 defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
909 defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare.
910 defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
911 defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
912 defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
913 defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
914 defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
915 defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication.
916 defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
917 defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
918 defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
919 defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication.
920 defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
921 defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
922 defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
// Division and square root are not fully pipelined: note the multi-cycle
// ReleaseAtCycles on the single Zn4FPFDiv resource.
923 defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>; // Floating point division.
924 defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
925 defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
926 defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
927 defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division.
928 defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
929 defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
930 defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
931 defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root.
932 defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
933 defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
934 defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>; // Floating point square root (ZMM).
935 defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root.
936 defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
937 defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
938 defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
939 defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
940 defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate.
941 defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
942 defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
943 defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
944 defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate.
945 defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
946 defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
947 defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
948 defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add.
949 defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
950 defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
951 defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
952 defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
953 defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
954 defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
955 defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
956 defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
957 defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
958 defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).
960 defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
961 defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
962 defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
963 defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
964 defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
965 defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
966 defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
967 defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
968 defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
969 defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
970 defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
971 defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
972 defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
973 defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
974 defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
975 defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
976 defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
977 defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).
979 // Horizontal Add/Sub (float and integer)
980 defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
981 defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
982 defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
983 defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
984 defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
985 defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
986 defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
988 // Vector integer operations.
// Vector loads (plain, NT, masked): vector load latency + 1 cycle.
989 defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
990 defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
991 defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
992 defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
993 defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
994 defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
995 defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
996 defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
997 defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
// VEXTRACTF128/VEXTRACTI128: register form is a single shuffle-like uop.
999 def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
1001   let ReleaseAtCycles = [1];
1002   let NumMicroOps = 1;
1004 def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
1006 def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
// NOTE(review): this is a store form, yet its latency is derived from
// Znver4Model.LoadLatency rather than StoreLatency — confirm intentional.
1007   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1008   let ReleaseAtCycles = [1, 1, 1];
1009   let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
1011 def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
// VINSERTF128 with memory source: same uop count as the rr extract
// (the !add of 0 keeps the derivation pattern used by the mr form above).
1013 def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
1014   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1015   let ReleaseAtCycles = [1, 1, 1];
1016   let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1018 def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rmi)>;
1020 defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1021 defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1022 defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
// Masked vector stores mirror the heavy masked FP stores above.
1023 defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1024 defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
1025 defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
1026 defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1028 defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
1029 defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;
// MMX register-to-register moves; the MOVD forms occupy the misc pipes longer.
1031 def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1033   let ReleaseAtCycles = [1, 2];
1034   let NumMicroOps = 2;
1036 def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1038 def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1040   let ReleaseAtCycles = [1, 4];
1041   let NumMicroOps = 2;
1043 def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1045 defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.
// SSE4a EXTRQ/INSERTQ: register form vs immediate form differ only in uops.
1047 def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1049   let ReleaseAtCycles = [1, 1];
1050   let NumMicroOps = 1;
1052 def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1054 def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1056   let ReleaseAtCycles = [1, 1];
1057   let NumMicroOps = 2;
1059 def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1061 defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
// Overrides for XMM integer ALU ops that are restricted to the two
// Zn4FPVAdd01 pipes (vs the generic four-pipe WriteVecALUX above).
1063 def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1065   let ReleaseAtCycles = [2];
1066   let NumMicroOps = 1;
1068 def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1069                                             PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1071                                             PSIGNBrr, PSIGNDrr, PSIGNWrr,
1072                                             VPABSBrr, VPABSDrr, VPABSWrr,
1073                                             VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1076                                             VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1077                                             PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
// AVX-512 opmask (k-register) arithmetic/logic/move ops: single uop on the
// dedicated opmask pipes.
1079 def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
1081   let ReleaseAtCycles = [1];
1082   let NumMicroOps = 1;
1084 def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBkk, KADDDkk, KADDQkk, KADDWkk,
1085                                           KANDBkk, KANDDkk, KANDQkk, KANDWkk,
1086                                           KANDNBkk, KANDNDkk, KANDNQkk, KANDNWkk,
1087                                           KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
1088                                           KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
1089                                           KNOTBkk, KNOTDkk, KNOTQkk, KNOTWkk,
1090                                           KORBkk, KORDkk, KORQkk, KORWkk,
1091                                           KORTESTBkk, KORTESTDkk, KORTESTQkk, KORTESTWkk,
1092                                           KTESTBkk, KTESTDkk, KTESTQkk, KTESTWkk,
1093                                           KUNPCKBWkk, KUNPCKDQkk, KUNPCKWDkk,
1094                                           KXNORBkk, KXNORDkk, KXNORQkk, KXNORWkk,
1095                                           KXORBkk, KXORDkk, KXORQkk, KXORWkk)>;
// k-register stores (mk) and GPR->k moves (kr) use the separate Zn4FPOpMask4 pipe.
1097 def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
1099   let ReleaseAtCycles = [1];
1100   let NumMicroOps = 1;
1102 def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;
1104 def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
1106   let ReleaseAtCycles = [1];
1107   let NumMicroOps = 1;
1109 def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
// AVX-512 VALIGND/VALIGNQ immediate forms, pinned to the Zn4FPVAdd12 pipes.
1111 def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1112  // TODO: All align instructions are expected to be of 4 cycle latency
1114   let ReleaseAtCycles = [1];
1115   let NumMicroOps = 1;
1117 def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
1118                                             VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
1120 defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
// YMM integer ALU ops restricted to the two Zn4FPVAdd01 pipes, analogous to
// the XMM Zn4WriteVecALUXSlow override above.
1122 def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1124   let ReleaseAtCycles = [1];
1125   let NumMicroOps = 1;
1127 def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1128                                             VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1129                                             VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1130                                             VPAVGBYrr, VPAVGWYrr,
1132                                             VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1134 defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).
// Vector integer logic, test, shift, multiply, shuffle, blend, PSADBW, MPSAD.
// As with the FP section, ZMM forms double the YMM ReleaseAtCycles.
1136 defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
1137 defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1138 defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1139 defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
1140 defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1141 defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
1142 defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM).
1143 defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
1144 defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
1145 defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1146 defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
1147 defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
1148 defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1149 defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1150 defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
1151 defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
1152 defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1153 defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1154 defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
1155 defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
1156 defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1157 defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
1158 defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles.
1159 defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1160 defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1161 defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
1162 defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
1163 defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1164 defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1165 defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
1166 defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
1167 defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1168 defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
1169 defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
1170 defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1171 defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
1172 defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
1173 defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1174 defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1175 defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
1176 defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1177 defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1178 defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
1179 defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.
1181 // Vector insert/extract operations.
// LoadUOps=-1: the folded-load form takes one fewer uop than rr-form + load.
1182 defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1183 defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1184 defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1186 // MOVMSK operations.
1187 defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1188 defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1189 defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
1190 defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1192 // Conversion between integer and float.
1193 defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer.
1194 defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
1195 defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
1196 defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).
// NOTE(review): no InstRW consumer for Zn4WriteCvtPD2IMMX is visible in this
// chunk — presumably attached elsewhere in the file; verify it is referenced.
1198 def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1200   let ReleaseAtCycles = [2];
1201   let NumMicroOps = 2;
1203 defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer.
1205 defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1206 defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
1207 defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).
1209 defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
1210 defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1211 defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1212 defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM).
// NOTE(review): consumer not visible in this chunk (see Zn4WriteCvtPD2IMMX).
1214 def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1216   let ReleaseAtCycles = [6];
1217   let NumMicroOps = 2;
1220 defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
1221 defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1222 defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1223 defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).
// NOTE(review): consumer not visible in this chunk (see Zn4WriteCvtPD2IMMX).
1225 def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1227   let ReleaseAtCycles = [1];
1228   let NumMicroOps = 2;
1231 defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
1232 defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1233 defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1234 defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).
1236 defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
1237 defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1238 defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1239 defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).
1241 defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1242 defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1243 defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).
1245 defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1246 defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1247 defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).
1249 defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1250 defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
// NOTE(review): the ZMM store form below uses the Zn4WriteResYMM helper while
// the non-store ZMM form above uses Zn4WriteResZMM — confirm intentional.
1251 defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).
1253 // CRC32 instruction.
1254 defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;
// SHA-NI instructions on the shared Zn4FPU0123 resource. Each rm (memory)
// form derives latency from its rr form plus the model load latency.
// NOTE(review): the rm forms add 0 extra uops for MSG1/MSG2(SHA1)/NEXTE but
// 1 extra uop for SHA256MSG2 — confirm the inconsistency is intentional.
1256 def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
1258   let ReleaseAtCycles = [2];
1259   let NumMicroOps = 2;
1261 def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
1263 def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1264   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
1265   let ReleaseAtCycles = [1, 1, 2];
1266   let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
1268 def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
1270 def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
1272   let ReleaseAtCycles = [2];
1273   let NumMicroOps = 1;
1275 def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
1277 def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1278   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
1279   let ReleaseAtCycles = [1, 1, 2];
1280   let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
1282 def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1284 def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
1286   let ReleaseAtCycles = [3];
1287   let NumMicroOps = 2;
1289 def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
1291 def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1292   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
1293   let ReleaseAtCycles = [1, 1, 3];
1294   let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
1296 def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1298 def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
1300   let ReleaseAtCycles = [8];
1301   let NumMicroOps = 4;
1303 def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
1305 def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1306   let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
1307   let ReleaseAtCycles = [1, 1, 8];
1308   let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
1310 def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
1312 def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
1314   let ReleaseAtCycles = [8];
1315   let NumMicroOps = 1;
1317 def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
1319 def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
1321   let ReleaseAtCycles = [8];
1322   let NumMicroOps = 1;
1324 def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1326 // Strings instructions.
1327 // Packed Compare Implicit Length Strings, Return Mask
1328 defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1329 // Packed Compare Explicit Length Strings, Return Mask
1330 defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1331 // Packed Compare Implicit Length Strings, Return Index
1332 defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
1333 // Packed Compare Explicit Length Strings, Return Index
1334 defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1336 // AES instructions.
1337 defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
1338 defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
1339 defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
1341 // Carry-less multiplication instructions.
1342 defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
1345 defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1348 defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1349 defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1351 // Catch-all for expensive system instructions.
1352 defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
1354 def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
1355 let Latency = 0; // FIXME: not from llvm-exegesis
1356 let ReleaseAtCycles = [1];
1357 let NumMicroOps = 1;
1359 def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;
1361 def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
1362 let Latency = 10; // FIXME: not from llvm-exegesis
1363 let ReleaseAtCycles = [24];
1364 let NumMicroOps = 18;
1366 def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;
1369 defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1370 defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1371 defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.
1373 def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
1375 let ReleaseAtCycles = [1];
1376 let NumMicroOps = 1;
1378 def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;
1380 def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1381 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
1382 let ReleaseAtCycles = [1, 1, 1];
1383 let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
1385 def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
1387 def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
1389 let ReleaseAtCycles = [1];
1390 let NumMicroOps = 2;
1392 def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
1394 def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1395 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
1396 let ReleaseAtCycles = [1, 1, 2];
1397 let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
1399 def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
1401 def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
1403 let ReleaseAtCycles = [1];
1404 let NumMicroOps = 2;
1406 def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
1408 def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1409 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
1410 let ReleaseAtCycles = [1, 1, 2];
1411 let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
1413 def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
1415 def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
1417 let ReleaseAtCycles = [1];
1418 let NumMicroOps = 2;
1420 def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
1422 def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1423 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
1424 let ReleaseAtCycles = [1, 1, 2];
1425 let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
1427 def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
1429 defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1430 defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
1431 defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
1432 defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1433 defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).
1435 // Old microcoded instructions that nobody uses.
1436 defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;
1438 // Fence instructions.
1439 defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;
1441 def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
1443 let ReleaseAtCycles = [30];
1444 let NumMicroOps = 1;
1446 def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;
1448 def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
1450 let ReleaseAtCycles = [1];
1451 let NumMicroOps = 1;
1453 def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;
1455 // Nop, not very useful except it provides a model for nops!
1456 defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1459 ///////////////////////////////////////////////////////////////////////////////
1461 ///////////////////////////////////////////////////////////////////////////////
1463 def Zn4WriteZeroLatency : SchedWriteRes<[]> {
1465 let ReleaseAtCycles = [];
1466 let NumMicroOps = 1;
1468 def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
1469 MOV64rr, MOV64rr_REV,
1472 def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
1474 let ReleaseAtCycles = [];
1475 let NumMicroOps = 2;
1477 def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
1478 XCHG64rr, XCHG64ar)>;
1480 defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
1482 defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
1483 defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
1484 defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>;
1486 defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
1487 defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
1488 defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
1489 defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>;
1491 def : IsOptimizableRegisterMove<[
1492 InstructionEquivalenceClass<[
1494 MOV32rr, MOV32rr_REV,
1495 MOV64rr, MOV64rr_REV,
1501 // MMX moves are *NOT* eliminated.
1504 MOVAPSrr, MOVAPSrr_REV,
1505 MOVUPSrr, MOVUPSrr_REV,
1506 MOVAPDrr, MOVAPDrr_REV,
1507 MOVUPDrr, MOVUPDrr_REV,
1508 MOVDQArr, MOVDQArr_REV,
1509 MOVDQUrr, MOVDQUrr_REV,
1512 VMOVAPSrr, VMOVAPSrr_REV,
1513 VMOVUPSrr, VMOVUPSrr_REV,
1514 VMOVAPDrr, VMOVAPDrr_REV,
1515 VMOVUPDrr, VMOVUPDrr_REV,
1516 VMOVDQArr, VMOVDQArr_REV,
1517 VMOVDQUrr, VMOVDQUrr_REV,
1519 // AVX YMM variants.
1520 VMOVAPSYrr, VMOVAPSYrr_REV,
1521 VMOVUPSYrr, VMOVUPSYrr_REV,
1522 VMOVAPDYrr, VMOVAPDYrr_REV,
1523 VMOVUPDYrr, VMOVUPDYrr_REV,
1524 VMOVDQAYrr, VMOVDQAYrr_REV,
1525 VMOVDQUYrr, VMOVDQUYrr_REV,
1529 // FIXUP and RANGE Instructions
1530 def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
1532 let ReleaseAtCycles = [2];
1533 let NumMicroOps = 1;
1535 def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
1536 "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
1537 "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
1538 "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
1541 // SCALE & REDUCE instructions
1542 def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
1544 let ReleaseAtCycles = [6];
1545 let NumMicroOps = 2;
1547 def : InstRW<[Zn4WriteSCALErr], (instregex
1548 "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?)",
1549 "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
1552 //BF16PS Instructions
1553 def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
1555 let ReleaseAtCycles = [6];
1556 let NumMicroOps = 2;
1558 def : InstRW<[Zn4WriteBF16], (instregex
1559 "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
1562 // BUSD and VPMADD Instructions
1563 def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
1565 let ReleaseAtCycles = [4];
1566 let NumMicroOps = 1;
1568 def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
1569 "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
1570 "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
1573 // SHIFT instructions
1574 def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
1576 let ReleaseAtCycles = [2];
1577 let NumMicroOps = 1;
1579 def : InstRW<[Zn4WriteSHIFTrr], (instregex
1580 "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
1581 "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1582 "(V?)P(SLL|SRL|SRA)DQYri",
1583 "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
1584 "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
1585 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1586 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
1587 "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
1588 "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
1591 def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
1593 let ReleaseAtCycles = [1];
1594 let NumMicroOps = 1;
1596 def : InstRW<[Zn4WriteSHIFTri], (instregex
1597 "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
1600 // ALIGN Instructions
1601 def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
1603 let ReleaseAtCycles = [2];
1604 let NumMicroOps = 1;
1606 def : InstRW<[Zn4WriteALIGN], (instregex
1607 "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
1611 def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
1613 let ReleaseAtCycles = [2];
1614 let NumMicroOps = 1;
1616 def : InstRW<[Zn4WritePACK], (instregex
1617 "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
1620 // MAX and MIN Instructions
1621 def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
1623 let ReleaseAtCycles = [2];
1624 let NumMicroOps = 1;
1626 def : InstRW<[Zn4WriteFCmp64], (instregex
1627 "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
1628 "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
1629 "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
1630 "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
1634 def Zn4MOVDUPZ: SchedWriteRes<[Zn4FPFMisc12]> {
1636 let ReleaseAtCycles = [2];
1637 let NumMicroOps = 1;
1639 def : InstRW<[Zn4MOVDUPZ], (instregex
1640 "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)"
1643 def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
1645 let ReleaseAtCycles = [1];
1646 let NumMicroOps = 1;
1648 def : InstRW<[Zn4MOVS], (instregex
1649 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Y?|Z128?|Z256?)(rr|rrk|rrkz)",
1650 "(V?)PMOV(S?|US?)(DB|DW|QB|QD|QW|WB)(Z128|Z256)(rr|rrk|rrkz)"
1653 def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
1655 let ReleaseAtCycles = [2];
1656 let NumMicroOps = 1;
1658 def : InstRW<[Zn4MOVSZ], (instregex
1659 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)Z(rr|rrk|rrkz)"
1662 def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
1664 let ReleaseAtCycles = [2];
1665 let NumMicroOps = 1;
1667 def : InstRW<[Zn4MOVSrr], (instregex
1668 "(V?)PMOV(S?|US?)(DB|DW|QB|QD|QW|WB)Z(rr|rrk|rrkz)"
1672 //VPTEST Instructions
1673 def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1675 let ReleaseAtCycles = [3];
1676 let NumMicroOps = 1;
1678 def : InstRW<[Zn4VPTESTZ128], (instregex
1679 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
1682 def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1684 let ReleaseAtCycles = [4];
1685 let NumMicroOps = 1;
1687 def : InstRW<[Zn4VPTESTZ256], (instregex
1688 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
1691 def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
1693 let ReleaseAtCycles = [5];
1694 let NumMicroOps = 1;
1696 def : InstRW<[Zn4VPTESTZ], (instregex
1697 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
1700 // CONFLICT Instructions
1701 def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1703 let ReleaseAtCycles = [2];
1704 let NumMicroOps = 1;
1706 def : InstRW<[Zn4CONFLICTZ128], (instregex
1707 "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
1710 def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
1712 let ReleaseAtCycles = [2,2,2];
1713 let NumMicroOps = 4;
1715 def : InstRW<[Zn4CONFLICTrr], (instregex
1716 "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
1719 // RSQRT Instructions
1720 def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1722 let ReleaseAtCycles = [2];
1723 let NumMicroOps = 1;
1725 def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
1726 "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
1730 // PERM Instructions
1731 def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
1733 let ReleaseAtCycles = [2];
1734 let NumMicroOps = 1;
1736 def : InstRW<[Zn4PERMILP], (instregex
1737 "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
1740 def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
1742 let ReleaseAtCycles = [2];
1743 let NumMicroOps = 1;
1745 def : InstRW<[Zn4PERMIT2_128], (instregex
1746 "VPERM(I2|T2)(PS|PD|W)Z128(rr|rrk|rrkz)",
1747 "VPERM(I2|T2)(B|D|Q)Z128(rr|rrk|rrkz)"
1750 def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
1752 let ReleaseAtCycles = [2];
1753 let NumMicroOps = 1;
1755 def : InstRW<[Zn4PERMIT2_128rr], (instregex
1756 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
1757 "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
1760 def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
1762 let ReleaseAtCycles = [2];
1763 let NumMicroOps = 1;
1765 def : InstRW<[Zn4PERMIT2_256], (instregex
1766 "VPERM(I2|T2)(PS|PD|W)Z256(rr|rrk|rrkz)",
1767 "VPERMP(S|D)Z256(rr|rrk|rrkz)",
1768 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
1769 "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
1770 "VPERM(I2|Q|T2)(B|D|Q)Z256(rr|rrk|rrkz)",
1771 "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
1774 def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
1776 let ReleaseAtCycles = [2];
1777 let NumMicroOps = 1;
1779 def : InstRW<[Zn4PERMIT2Z], (instregex
1780 "VPERM(I2|T2)(PS|PD|W)Z(rr|rrk|rrkz)",
1781 "VPERM(B|D|W)Z(rr|rrk|rrkz)",
1782 "VPERM(I2|Q|T2)(B|D|Q)Z(rr|rrk|rrkz)",
1783 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
1784 "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
1785 "VPERMP(S|D)Z(rr|rrk|rrkz)"
1788 // ALU SLOW Misc Instructions
1789 def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
1791 let ReleaseAtCycles = [2];
1792 let NumMicroOps = 1;
1794 def : InstRW<[Zn4VecALUZSlow], (instrs
1795 VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr,
1796 VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk,
1797 VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz,
1798 VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr,
1799 VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk,
1800 VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
1801 VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr,
1802 VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk,
1803 VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
1804 VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
1805 VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk,
1806 VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz,
1807 VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
1808 VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
1812 ///////////////////////////////////////////////////////////////////////////////
1813 // Dependency breaking instructions.
1814 ///////////////////////////////////////////////////////////////////////////////
1816 def Zn4WriteZeroIdiom : SchedWriteVariant<[
1817 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1818 SchedVar<NoSchedPred, [WriteALU]>
// GPR zero idioms: XOR/SUB of a register with itself. When the
// ZeroIdiomPredicate matches (same source and destination register), these
// resolve via Zn4WriteZeroLatency (renamer-eliminated, no execution resources);
// otherwise they fall back to the generic WriteALU class.
1820 def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
1821 XOR64rr, XOR64rr_REV,
1822 SUB32rr, SUB32rr_REV,
1823 SUB64rr, SUB64rr_REV)>;
1825 def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
1826 SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
1827 SchedVar<NoSchedPred, [WriteALU]>
// CMP of a register against itself only produces EFLAGS (a known, constant
// result); CheckSameRegOperand<0, 1> selects Zn4WriteZeroLatency for that
// same-register form, otherwise the generic WriteALU class applies.
1829 def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
1830 CMP16rr, CMP16rr_REV,
1831 CMP32rr, CMP32rr_REV,
1832 CMP64rr, CMP64rr_REV)>;
1834 def Zn4WriteFZeroIdiom : SchedWriteVariant<[
1835 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1836 SchedVar<NoSchedPred, [WriteFLogic]>
1838 // NOTE: XORPSrr, XORPDrr are not zero-cycle!
1839 def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
1842 VANDNPSrr, VANDNPDrr,
1846 def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
1847 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1848 SchedVar<NoSchedPred, [WriteFLogicY]>
1850 def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
1853 VANDNPSYrr, VANDNPDYrr,
1857 def Zn4WriteFZeroIdiomZ : SchedWriteVariant<[
1858 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1859 SchedVar<NoSchedPred, [WriteFLogicZ]>
1861 def : InstRW<[Zn4WriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr,
1862 VANDNPSZrr, VANDNPDZrr)>;
1864 def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
1865 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1866 SchedVar<NoSchedPred, [WriteVecLogicX]>
1868 // NOTE: PXORrr,PANDNrr are not zero-cycle!
1869 def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr,
1876 def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
1877 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1878 SchedVar<NoSchedPred, [WriteVecLogicY]>
1880 def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr,
1887 def Zn4WriteVZeroIdiomLogicZ : SchedWriteVariant<[
1888 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1889 SchedVar<NoSchedPred, [WriteVecLogicZ]>
1891 def : InstRW<[Zn4WriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr,
1892 VPANDNDZrr, VPANDNQZrr)>;
1894 def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
1895 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1896 SchedVar<NoSchedPred, [WriteVecALUX]>
1898 // NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1899 // PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1900 def : InstRW<[Zn4WriteVZeroIdiomALUX],
1901 (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1902 VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr,
1903 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1904 VPCMPGTBZ128rr, VPCMPGTWZ128rr,
1905 VPCMPGTDZ128rr, VPCMPGTQZ128rr)>;
1907 def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
1908 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1909 SchedVar<NoSchedPred, [WriteVecALUY]>
1911 def : InstRW<[Zn4WriteVZeroIdiomALUY],
1912 (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1913 VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr,
1914 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr,
1915 VPCMPGTBZ256rr, VPCMPGTWZ256rr,
1916 VPCMPGTDZ256rr, VPCMPGTQZ256rr)>;
1918 def Zn4WriteVZeroIdiomALUZ : SchedWriteVariant<[
1919 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1920 SchedVar<NoSchedPred, [WriteVecALUZ]>
// Attach the ZMM zero-idiom ALU variant to the 512-bit sub/compare instrs.
// Fix: this previously referenced Zn4WriteVZeroIdiomALUY (copy-paste from the
// YMM group above), which would route ZMM instructions to WriteVecALUY on the
// non-zero-idiom path instead of WriteVecALUZ.
1922 def : InstRW<[Zn4WriteVZeroIdiomALUZ],
1923 (instrs VPSUBBZrr, VPSUBWZrr, VPSUBDZrr, VPSUBQZrr,
1924 VPCMPGTBZrr, VPCMPGTWZrr, VPCMPGTDZrr, VPCMPGTQZrr)>;
1926 def : IsZeroIdiomFunction<[
1928 DepBreakingClass<[ XOR32rr, XOR32rr_REV,
1929 XOR64rr, XOR64rr_REV,
1930 SUB32rr, SUB32rr_REV,
1931 SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
1933 // SSE XMM Zero-idioms.
1942 PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1944 PSUBUSBrr, PSUBUSWrr,
1945 PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
1946 ], ZeroIdiomPredicate>,
1948 // AVX XMM Zero-idioms.
1952 VANDNPSrr, VANDNPDrr,
1957 VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1958 VPSUBSBrr, VPSUBSWrr,
1959 VPSUBUSBrr, VPSUBUSWrr,
1960 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1961 ], ZeroIdiomPredicate>,
1963 // AVX YMM Zero-idioms.
1966 VXORPSYrr, VXORPDYrr,
1967 VANDNPSYrr, VANDNPDYrr,
1972 VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1973 VPSUBSBYrr, VPSUBSWYrr,
1974 VPSUBUSBYrr, VPSUBUSWYrr,
1975 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
1976 ], ZeroIdiomPredicate>,
1978 // AVX ZMM Zero-idioms.
1981 VXORPSZrr, VXORPDZrr,
1982 VXORPSZ128rr, VXORPDZ128rr, VXORPSZ256rr, VXORPDZ256rr,
1983 VANDNPSZrr, VANDNPDZrr,
1984 VANDNPSZ128rr, VANDNPDZ128rr, VANDNPSZ256rr, VANDNPDZ256rr,
1987 VPCMPGTBZrr, VPCMPGTWZrr, VPCMPGTDZrr, VPCMPGTQZrr,
1988 VPCMPGTBZ128rr, VPCMPGTWZ128rr, VPCMPGTDZ128rr, VPCMPGTQZ128rr,
1989 VPCMPGTBZ256rr, VPCMPGTWZ256rr, VPCMPGTDZ256rr, VPCMPGTQZ256rr,
1990 VPANDNDZrr, VPANDNQZrr,
1991 VPANDNDZ128rr, VPANDNQZ128rr, VPANDNDZ256rr, VPANDNQZ256rr,
1992 VPXORDZrr, VPXORQZrr,
1993 VPXORDZ128rr, VPXORQZ128rr, VPXORDZ256rr, VPXORQZ256rr,
1994 VPSUBBZrr, VPSUBWZrr, VPSUBDZrr, VPSUBQZrr,
1995 VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr,
1996 VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr,
1997 ], ZeroIdiomPredicate>,
2000 def : IsDepBreakingFunction<[
2002 DepBreakingClass<[ SBB32rr, SBB32rr_REV,
2003 SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
2004 DepBreakingClass<[ CMP8rr, CMP8rr_REV,
2005 CMP16rr, CMP16rr_REV,
2006 CMP32rr, CMP32rr_REV,
2007 CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
2010 PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
2011 ], ZeroIdiomPredicate>,
2015 VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
2016 ], ZeroIdiomPredicate>,
2020 VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
2021 ], ZeroIdiomPredicate>,