1 //===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines the itinerary class data for the POWER7 processor.
11 //===----------------------------------------------------------------------===//
14 // IBM POWER7 multicore server processor
15 // B. Sinharoy, et al.
16 // IBM J. Res. & Dev. (55) 3. May/June 2011.
18 // Scheduling for the P7 involves tracking two types of resources:
19 // 1. The dispatch bundle slots
20 // 2. The functional unit resources
// Presumably the six dispatch bundle slots (resource type 1 in the list
// above) -- confirm. In the itinerary table below, IIC_BrB and IIC_BrMCR
// dispatch only through DU5/DU6.
23 def P7_DU1 : FuncUnit;
24 def P7_DU2 : FuncUnit;
25 def P7_DU3 : FuncUnit;
26 def P7_DU4 : FuncUnit;
27 def P7_DU5 : FuncUnit;
28 def P7_DU6 : FuncUnit;
// Functional unit resources (resource type 2 in the list above): two
// load/store pipelines, two FX pipelines, two VS pipelines, one CR unit and
// one BR unit.
30 def P7_LS1 : FuncUnit; // Load/Store pipeline 1
31 def P7_LS2 : FuncUnit; // Load/Store pipeline 2
33 def P7_FX1 : FuncUnit; // FX pipeline 1
34 def P7_FX2 : FuncUnit; // FX pipeline 2
36 // VS pipeline 1 (vector integer ops. always here)
37 def P7_VS1 : FuncUnit; // VS pipeline 1
38 // VS pipeline 2 (128-bit stores and perms. here)
39 def P7_VS2 : FuncUnit; // VS pipeline 2
41 def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
42 def P7_BRU : FuncUnit; // BR unit
45 // Each LSU pipeline can also execute FX add and logical instructions.
46 // Each LSU pipeline can complete a load or store in one cycle.
48 // Each store is broken into two parts, AGEN goes to the LSU while a
49 // "data steering" op. goes to the FXU or VSU.
51 // FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
52 // VSU loads have a three cycle load-to-use latency (so two "bubble" cycles).
54 // Frequent FX ops. take only one cycle and results can be used again in the
55 // next cycle (there is a self-bypass). Getting results from the other FX
56 // pipeline takes an additional cycle.
58 // The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
59 // (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
60 // Dispatch of an instruction to VS1 that uses four single prec. inputs
61 // (either to a float or XC op.) prevents dispatch in that cycle to VS2 of any
62 // floating point instruction.
64 // The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
65 // (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
66 // (unlike on the POWER6).
68 // FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
69 // share the same write-back, and have a 5-cycle latency difference, so the
70 // IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
71 // op. has been dispatched to VS1.
73 // Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
75 // Instruction dispatch groups have (at most) four non-branch instructions, and
76 // two branches. Unlike on the POWER4/5, a branch does not automatically
77 // end the dispatch group, but a second branch must be the last in the group.
// Itinerary table for the POWER7. The first list names every P7_* functional
// unit declared above, the second list is left empty, and the third list holds
// the InstrItinData records, keyed by itinerary class (IIC_*).
// NOTE(review): the trailing 0 on many InstrStage records is presumably the
// TimeInc argument (the following stage starts in the same cycle) -- confirm
// against TargetItinerary.td.
79 def P7Itineraries : ProcessorItineraries<
80 [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
81 P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
82 InstrItinData<IIC_IntSimple , [InstrStage<1, [P7_DU1, P7_DU2,
84 InstrStage<1, [P7_FX1, P7_FX2,
87 InstrItinData<IIC_IntGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
89 InstrStage<1, [P7_FX1, P7_FX2]>],
91 InstrItinData<IIC_IntISEL, [InstrStage<1, [P7_DU1], 0>,
92 InstrStage<1, [P7_FX1, P7_FX2], 0>,
93 InstrStage<1, [P7_BRU]>],
95 InstrItinData<IIC_IntCompare , [InstrStage<1, [P7_DU1, P7_DU2,
97 InstrStage<1, [P7_FX1, P7_FX2]>],
99 // FIXME: Add record-form itinerary data.
// The two divide classes are the only records here whose FX stage is longer
// than 1: 36 for IIC_IntDivW and 68 for IIC_IntDivD.
100 InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>,
101 InstrStage<1, [P7_DU2], 0>,
102 InstrStage<36, [P7_FX1, P7_FX2]>],
104 InstrItinData<IIC_IntDivD , [InstrStage<1, [P7_DU1], 0>,
105 InstrStage<1, [P7_DU2], 0>,
106 InstrStage<68, [P7_FX1, P7_FX2]>],
108 InstrItinData<IIC_IntMulHW , [InstrStage<1, [P7_DU1, P7_DU2,
110 InstrStage<1, [P7_FX1, P7_FX2]>],
112 InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P7_DU1, P7_DU2,
114 InstrStage<1, [P7_FX1, P7_FX2]>],
116 InstrItinData<IIC_IntMulHD , [InstrStage<1, [P7_DU1, P7_DU2,
118 InstrStage<1, [P7_FX1, P7_FX2]>],
120 InstrItinData<IIC_IntMulLI , [InstrStage<1, [P7_DU1, P7_DU2,
122 InstrStage<1, [P7_FX1, P7_FX2]>],
124 InstrItinData<IIC_IntRotate , [InstrStage<1, [P7_DU1, P7_DU2,
126 InstrStage<1, [P7_FX1, P7_FX2]>],
128 InstrItinData<IIC_IntRotateD , [InstrStage<1, [P7_DU1, P7_DU2,
130 InstrStage<1, [P7_FX1, P7_FX2]>],
132 InstrItinData<IIC_IntRotateDI , [InstrStage<1, [P7_DU1, P7_DU2,
134 InstrStage<1, [P7_FX1, P7_FX2]>],
136 InstrItinData<IIC_IntShift , [InstrStage<1, [P7_DU1, P7_DU2,
138 InstrStage<1, [P7_FX1, P7_FX2]>],
140 InstrItinData<IIC_IntTrapW , [InstrStage<1, [P7_DU1, P7_DU2,
142 InstrStage<1, [P7_FX1, P7_FX2]>],
144 InstrItinData<IIC_IntTrapD , [InstrStage<1, [P7_DU1, P7_DU2,
146 InstrStage<1, [P7_FX1, P7_FX2]>],
// IIC_BrB, IIC_BrMCR and the first IIC_BrMCRX record dispatch through DU5/DU6
// and use the BR unit; IIC_BrCR dispatches through DU1 and uses the CR unit.
148 InstrItinData<IIC_BrB , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
149 InstrStage<1, [P7_BRU]>],
151 InstrItinData<IIC_BrCR , [InstrStage<1, [P7_DU1], 0>,
152 InstrStage<1, [P7_CRU]>],
154 InstrItinData<IIC_BrMCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
155 InstrStage<1, [P7_BRU]>],
157 InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
158 InstrStage<1, [P7_BRU]>],
160 InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2,
162 InstrStage<1, [P7_LS1, P7_LS2]>],
164 InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
165 InstrStage<1, [P7_DU2], 0>,
166 InstrStage<1, [P7_LS1, P7_LS2], 0>,
167 InstrStage<1, [P7_FX1, P7_FX2]>],
169 InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
170 InstrStage<1, [P7_DU2], 0>,
171 InstrStage<1, [P7_DU3], 0>,
172 InstrStage<1, [P7_DU4], 0>,
173 InstrStage<1, [P7_FX1, P7_FX2]>,
174 InstrStage<1, [P7_LS1, P7_LS2], 0>,
175 InstrStage<1, [P7_FX1, P7_FX2]>],
177 InstrItinData<IIC_LdStLD , [InstrStage<1, [P7_DU1, P7_DU2,
179 InstrStage<1, [P7_LS1, P7_LS2]>],
181 InstrItinData<IIC_LdStLDU , [InstrStage<1, [P7_DU1], 0>,
182 InstrStage<1, [P7_DU2], 0>,
183 InstrStage<1, [P7_LS1, P7_LS2], 0>,
184 InstrStage<1, [P7_FX1, P7_FX2]>],
186 InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P7_DU1], 0>,
187 InstrStage<1, [P7_DU2], 0>,
188 InstrStage<1, [P7_DU3], 0>,
189 InstrStage<1, [P7_DU4], 0>,
190 InstrStage<1, [P7_FX1, P7_FX2]>,
191 InstrStage<1, [P7_LS1, P7_LS2], 0>,
192 InstrStage<1, [P7_FX1, P7_FX2]>],
194 InstrItinData<IIC_LdStLFD , [InstrStage<1, [P7_DU1, P7_DU2,
196 InstrStage<1, [P7_LS1, P7_LS2]>],
198 InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P7_DU1, P7_DU2,
200 InstrStage<1, [P7_LS1, P7_LS2]>],
202 InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P7_DU1], 0>,
203 InstrStage<1, [P7_DU2], 0>,
204 InstrStage<1, [P7_LS1, P7_LS2], 0>,
205 InstrStage<1, [P7_FX1, P7_FX2]>],
207 InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P7_DU1], 0>,
208 InstrStage<1, [P7_DU2], 0>,
209 InstrStage<1, [P7_LS1, P7_LS2], 0>,
210 InstrStage<1, [P7_FX1, P7_FX2]>],
212 InstrItinData<IIC_LdStLHA , [InstrStage<1, [P7_DU1], 0>,
213 InstrStage<1, [P7_DU2], 0>,
214 InstrStage<1, [P7_LS1, P7_LS2]>,
215 InstrStage<1, [P7_FX1, P7_FX2]>],
217 InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P7_DU1], 0>,
218 InstrStage<1, [P7_DU2], 0>,
219 InstrStage<1, [P7_LS1, P7_LS2], 0>,
220 InstrStage<1, [P7_FX1, P7_FX2]>,
221 InstrStage<1, [P7_FX1, P7_FX2]>],
223 InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P7_DU1], 0>,
224 InstrStage<1, [P7_DU2], 0>,
225 InstrStage<1, [P7_DU3], 0>,
226 InstrStage<1, [P7_DU4], 0>,
227 InstrStage<1, [P7_FX1, P7_FX2]>,
228 InstrStage<1, [P7_LS1, P7_LS2], 0>,
229 InstrStage<1, [P7_FX1, P7_FX2]>,
230 InstrStage<1, [P7_FX1, P7_FX2]>],
232 InstrItinData<IIC_LdStLWA , [InstrStage<1, [P7_DU1], 0>,
233 InstrStage<1, [P7_DU2], 0>,
234 InstrStage<1, [P7_LS1, P7_LS2]>,
235 InstrStage<1, [P7_FX1, P7_FX2]>],
237 InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P7_DU1], 0>,
238 InstrStage<1, [P7_DU2], 0>,
239 InstrStage<1, [P7_DU3], 0>,
240 InstrStage<1, [P7_DU4], 0>,
241 InstrStage<1, [P7_LS1, P7_LS2]>],
243 InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P7_DU1], 0>,
244 InstrStage<1, [P7_DU2], 0>,
245 InstrStage<1, [P7_DU3], 0>,
246 InstrStage<1, [P7_DU4], 0>,
247 InstrStage<1, [P7_LS1, P7_LS2]>],
249 InstrItinData<IIC_LdStLMW , [InstrStage<1, [P7_DU1, P7_DU2,
251 InstrStage<1, [P7_LS1, P7_LS2]>],
253 InstrItinData<IIC_LdStStore , [InstrStage<1, [P7_DU1, P7_DU2,
255 InstrStage<1, [P7_LS1, P7_LS2], 0>,
256 InstrStage<1, [P7_FX1, P7_FX2]>],
258 InstrItinData<IIC_LdStSTD , [InstrStage<1, [P7_DU1, P7_DU2,
260 InstrStage<1, [P7_LS1, P7_LS2], 0>,
261 InstrStage<1, [P7_FX1, P7_FX2]>],
263 InstrItinData<IIC_LdStSTU , [InstrStage<1, [P7_DU1], 0>,
264 InstrStage<1, [P7_DU2], 0>,
265 InstrStage<1, [P7_LS1, P7_LS2], 0>,
266 InstrStage<1, [P7_FX1, P7_FX2]>,
267 InstrStage<1, [P7_FX1, P7_FX2]>],
269 InstrItinData<IIC_LdStSTUX , [InstrStage<1, [P7_DU1], 0>,
270 InstrStage<1, [P7_DU2], 0>,
271 InstrStage<1, [P7_DU3], 0>,
272 InstrStage<1, [P7_DU4], 0>,
273 InstrStage<1, [P7_LS1, P7_LS2], 0>,
274 InstrStage<1, [P7_FX1, P7_FX2]>,
275 InstrStage<1, [P7_FX1, P7_FX2]>],
277 InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P7_DU1, P7_DU2,
279 InstrStage<1, [P7_LS1, P7_LS2], 0>,
280 InstrStage<1, [P7_VS1, P7_VS2]>],
282 InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P7_DU1], 0>,
283 InstrStage<1, [P7_DU2], 0>,
284 InstrStage<1, [P7_LS1, P7_LS2], 0>,
285 InstrStage<1, [P7_FX1, P7_FX2], 0>,
286 InstrStage<1, [P7_VS1, P7_VS2]>],
// Only VS2 is listed for the non-LSU stage of this store class, in line with
// the "128-bit stores and perms. here" note on P7_VS2.
288 InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P7_DU1, P7_DU2,
290 InstrStage<1, [P7_LS1, P7_LS2], 0>,
291 InstrStage<1, [P7_VS2]>],
293 InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P7_DU1], 0>,
294 InstrStage<1, [P7_DU2], 0>,
295 InstrStage<1, [P7_DU3], 0>,
296 InstrStage<1, [P7_DU4], 0>,
297 InstrStage<1, [P7_LS1, P7_LS2]>],
299 InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P7_DU1], 0>,
300 InstrStage<1, [P7_DU2], 0>,
301 InstrStage<1, [P7_DU3], 0>,
302 InstrStage<1, [P7_DU4], 0>,
303 InstrStage<1, [P7_LS1, P7_LS2]>],
// NOTE(review): IIC_BrMCRX already has a record earlier in this list (DU5/DU6
// then BRU). This second record for the same class uses DU1-DU4, the CRU and
// an FX pipeline instead -- confirm which itinerary class was intended here
// and which of the two records actually takes effect.
305 InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU1], 0>,
306 InstrStage<1, [P7_DU2], 0>,
307 InstrStage<1, [P7_DU3], 0>,
308 InstrStage<1, [P7_DU4], 0>,
309 InstrStage<1, [P7_CRU]>,
310 InstrStage<1, [P7_FX1, P7_FX2]>],
312 InstrItinData<IIC_SprMFCR , [InstrStage<1, [P7_DU1], 0>,
313 InstrStage<1, [P7_CRU]>],
315 InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>,
316 InstrStage<1, [P7_CRU]>],
318 InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P7_DU1], 0>,
319 InstrStage<1, [P7_FX1]>],
321 InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
323 InstrStage<1, [P7_VS1, P7_VS2]>],
325 InstrItinData<IIC_FPAddSub , [InstrStage<1, [P7_DU1, P7_DU2,
327 InstrStage<1, [P7_VS1, P7_VS2]>],
329 InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2,
331 InstrStage<1, [P7_VS1, P7_VS2]>],
333 InstrItinData<IIC_FPDivD , [InstrStage<1, [P7_DU1, P7_DU2,
335 InstrStage<1, [P7_VS1, P7_VS2]>],
337 InstrItinData<IIC_FPDivS , [InstrStage<1, [P7_DU1, P7_DU2,
339 InstrStage<1, [P7_VS1, P7_VS2]>],
341 InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P7_DU1, P7_DU2,
343 InstrStage<1, [P7_VS1, P7_VS2]>],
345 InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P7_DU1, P7_DU2,
347 InstrStage<1, [P7_VS1, P7_VS2]>],
349 InstrItinData<IIC_FPFused , [InstrStage<1, [P7_DU1, P7_DU2,
351 InstrStage<1, [P7_VS1, P7_VS2]>],
353 InstrItinData<IIC_FPRes , [InstrStage<1, [P7_DU1, P7_DU2,
355 InstrStage<1, [P7_VS1, P7_VS2]>],
// Vector classes: general, shift and complex ops list only VS1; the FP
// variants list both VS pipelines; permutes list only VS2.
357 InstrItinData<IIC_VecGeneral , [InstrStage<1, [P7_DU1], 0>,
358 InstrStage<1, [P7_VS1]>],
360 InstrItinData<IIC_VecVSL , [InstrStage<1, [P7_DU1], 0>,
361 InstrStage<1, [P7_VS1]>],
363 InstrItinData<IIC_VecVSR , [InstrStage<1, [P7_DU1], 0>,
364 InstrStage<1, [P7_VS1]>],
366 InstrItinData<IIC_VecFP , [InstrStage<1, [P7_DU1], 0>,
367 InstrStage<1, [P7_VS1, P7_VS2]>],
369 InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>,
370 InstrStage<1, [P7_VS1, P7_VS2]>],
372 InstrItinData<IIC_VecFPRound , [InstrStage<1, [P7_DU1], 0>,
373 InstrStage<1, [P7_VS1, P7_VS2]>],
375 InstrItinData<IIC_VecComplex , [InstrStage<1, [P7_DU1], 0>,
376 InstrStage<1, [P7_VS1]>],
378 InstrItinData<IIC_VecPerm , [InstrStage<1, [P7_DU1, P7_DU2], 0>,
379 InstrStage<1, [P7_VS2]>],
383 // ===---------------------------------------------------------------------===//
384 // P7 machine model for scheduling and other instruction cost heuristics.
386 def P7Model : SchedMachineModel {
387 let IssueWidth = 6; // 4 (non-branch) instructions are dispatched per cycle.
388 // Note that the dispatch bundle size is 6 (including
389 // branches), but the total internal issue bandwidth per
390 // cycle (from all queues) is 8.
392 let LoadLatency = 3; // Optimistic load latency assuming bypass.
393 // This is overridden by OperandCycles if the
394 // Itineraries are queried instead.
395 let MispredictPenalty = 16;
397 // Try to make sure we have at least 10 dispatch groups in a loop.
398 let LoopMicroOpBufferSize = 40;
400 let CompleteModel = 0;
402 let Itineraries = P7Itineraries;