/* Costs of operations of individual x86 CPUs.
   Copyright (C) 1988-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) scales a size of N bytes by 2, so under that
   assumption a 2-byte add costs 4, the same as COSTS_N_INSNS (1).
   The argument is parenthesized, so expressions such as
   COSTS_N_BYTES (a + b) expand correctly.  */
#define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop_algs initializer: libcall as the leading
   algorithm and a single {-1, libcall, false} bucket.  The per-CPU
   memcpy/memset tables in this file use it for their second slot.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
30 static stringop_algs ix86_size_memcpy
[2] = {
31 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
32 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}}};
33 static stringop_algs ix86_size_memset
[2] = {
34 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
35 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}}};
/* Cost table for size tuning.  The instruction costs below are written
   with COSTS_N_BYTES rather than COSTS_N_INSNS, the cache and prefetch
   entries are all 0 and the four alignment entries are NULL.
   NOTE(review): every code line carries a leading number, those numbers
   skip (e.g. 87 -> 91 and 129 -> 132), and no terminating brace is
   visible before the next definition.  This suggests lines were lost in
   extraction.  The initializer is positional, so verify the entry order
   against the upstream file before relying on it.  */
38 struct processor_costs ix86_size_cost
= {/* costs for tuning for size */
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 2, /* cost for loading QImode using movzbl */
42 {2, 2, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
45 {2, 2, 2}, /* cost of storing integer registers */
46 2, /* cost of reg,reg fld/fst */
47 {2, 2, 2}, /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
49 {2, 2, 2}, /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
51 3, /* cost of moving MMX register */
52 {3, 3}, /* cost of loading MMX registers
   in SImode and DImode */
54 {3, 3}, /* cost of storing MMX registers
   in SImode and DImode */
56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
61 3, 3, /* SSE->integer and integer->SSE moves */
62 3, 3, /* mask->integer and integer->mask moves */
63 {2, 2, 2}, /* cost of loading mask register
   in QImode, HImode, SImode. */
65 {2, 2, 2}, /* cost of storing mask register
   in QImode, HImode, SImode. */
67 2, /* cost of moving mask register. */
68 /* End of register allocator costs. */
71 COSTS_N_BYTES (2), /* cost of an add instruction */
72 COSTS_N_BYTES (3), /* cost of a lea instruction */
73 COSTS_N_BYTES (2), /* variable shift costs */
74 COSTS_N_BYTES (3), /* constant shift costs */
75 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 0, /* cost of multiply per each bit set */
81 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
82 COSTS_N_BYTES (3), /* HI */
83 COSTS_N_BYTES (3), /* SI */
84 COSTS_N_BYTES (3), /* DI */
85 COSTS_N_BYTES (5)}, /* other */
86 COSTS_N_BYTES (3), /* cost of movsx */
87 COSTS_N_BYTES (3), /* cost of movzx */
91 {2, 2, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
94 {2, 2, 2}, /* cost of storing integer registers */
95 {3, 3, 3, 3, 3}, /* cost of loading SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
97 {3, 3, 3, 3, 3}, /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
99 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load; five entries,
   presumably the same 32/64/128/256/512-bit widths as the
   aligned tables above (the earlier comment listed only
   128bit, 256bit and 512bit) -- TODO confirm */
101 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store; five entries,
   presumably the same 32/64/128/256/512-bit widths as the
   aligned tables above -- TODO confirm */
103 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
104 3, /* cost of moving SSE register to integer. */
105 5, 0, /* Gather load static, per_elt. */
106 5, 0, /* Gather store static, per_elt. */
107 0, /* size of l1 cache */
108 0, /* size of l2 cache */
109 0, /* size of prefetch block */
110 0, /* number of parallel prefetches */
112 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
113 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
114 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
115 COSTS_N_BYTES (2), /* cost of FABS instruction. */
116 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
117 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
119 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
120 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
121 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
122 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
123 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
124 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
125 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
126 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
127 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
128 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
129 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
132 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
133 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
134 NULL
, /* Loop alignment. */
135 NULL
, /* Jump alignment. */
136 NULL
, /* Label alignment. */
137 NULL
, /* Func alignment. */
138 4, /* Small unroll limit. */
139 2, /* Small unroll factor. */
/* Processor costs (relative to an add) */
143 static stringop_algs i386_memcpy
[2] = {
144 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
145 DUMMY_STRINGOP_ALGS
};
146 static stringop_algs i386_memset
[2] = {
147 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
148 DUMMY_STRINGOP_ALGS
};
/* Cost table for the 386.  Instruction costs are written with
   COSTS_N_INSNS; the cache and prefetch entries are all 0.
   NOTE(review): every code line carries a leading number, those numbers
   skip (e.g. 201 -> 204 and 240 -> 243), and no terminating brace is
   visible before the next definition.  This suggests lines were lost in
   extraction.  The initializer is positional, so verify the entry order
   against the upstream file before relying on it.  */
151 struct processor_costs i386_cost
= { /* 386 specific costs */
153 /* Start of register allocator costs. integer->integer move cost is 2. */
154 4, /* cost for loading QImode using movzbl */
155 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
158 {2, 4, 2}, /* cost of storing integer registers */
159 2, /* cost of reg,reg fld/fst */
160 {8, 8, 8}, /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
162 {8, 8, 8}, /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
164 2, /* cost of moving MMX register */
165 {4, 8}, /* cost of loading MMX registers
   in SImode and DImode */
167 {4, 8}, /* cost of storing MMX registers
   in SImode and DImode */
169 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
170 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
172 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
174 3, 3, /* SSE->integer and integer->SSE moves */
175 3, 3, /* mask->integer and integer->mask moves */
176 {2, 4, 2}, /* cost of loading mask register
   in QImode, HImode, SImode. */
178 {2, 4, 2}, /* cost of storing mask register
   in QImode, HImode, SImode. */
180 2, /* cost of moving mask register. */
181 /* End of register allocator costs. */
184 COSTS_N_INSNS (1), /* cost of an add instruction */
185 COSTS_N_INSNS (1), /* cost of a lea instruction */
186 COSTS_N_INSNS (3), /* variable shift costs */
187 COSTS_N_INSNS (2), /* constant shift costs */
188 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
189 COSTS_N_INSNS (6), /* HI */
190 COSTS_N_INSNS (6), /* SI */
191 COSTS_N_INSNS (6), /* DI */
192 COSTS_N_INSNS (6)}, /* other */
193 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
194 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
195 COSTS_N_INSNS (23), /* HI */
196 COSTS_N_INSNS (23), /* SI */
197 COSTS_N_INSNS (23), /* DI */
198 COSTS_N_INSNS (23)}, /* other */
199 COSTS_N_INSNS (3), /* cost of movsx */
200 COSTS_N_INSNS (2), /* cost of movzx */
201 15, /* "large" insn */
204 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
207 {2, 4, 2}, /* cost of storing integer registers */
208 {4, 8, 16, 32, 64}, /* cost of loading SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
210 {4, 8, 16, 32, 64}, /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
212 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
213 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
214 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
215 3, /* cost of moving SSE register to integer. */
216 4, 4, /* Gather load static, per_elt. */
217 4, 4, /* Gather store static, per_elt. */
218 0, /* size of l1 cache */
219 0, /* size of l2 cache */
220 0, /* size of prefetch block */
221 0, /* number of parallel prefetches */
223 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
224 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
225 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
226 COSTS_N_INSNS (22), /* cost of FABS instruction. */
227 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
228 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
230 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
231 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
232 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
233 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
234 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
235 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
236 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
237 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
238 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
239 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
240 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
243 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
244 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
245 "4", /* Loop alignment. */
246 "4", /* Jump alignment. */
247 NULL
, /* Label alignment. */
248 "4", /* Func alignment. */
249 4, /* Small unroll limit. */
250 2, /* Small unroll factor. */
253 static stringop_algs i486_memcpy
[2] = {
254 {rep_prefix_4_byte
, {{-1, rep_prefix_4_byte
, false}}},
255 DUMMY_STRINGOP_ALGS
};
256 static stringop_algs i486_memset
[2] = {
257 {rep_prefix_4_byte
, {{-1, rep_prefix_4_byte
, false}}},
258 DUMMY_STRINGOP_ALGS
};
/* Cost table for the 486.  Instruction costs are written with
   COSTS_N_INSNS.
   NOTE(review): every code line carries a leading number, those numbers
   skip (e.g. 311 -> 314 and 352 -> 355), and no terminating brace is
   visible before the next definition.  This suggests lines were lost in
   extraction.  The initializer is positional, so verify the entry order
   against the upstream file before relying on it.  */
261 struct processor_costs i486_cost
= { /* 486 specific costs */
263 /* Start of register allocator costs. integer->integer move cost is 2. */
264 4, /* cost for loading QImode using movzbl */
265 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
268 {2, 4, 2}, /* cost of storing integer registers */
269 2, /* cost of reg,reg fld/fst */
270 {8, 8, 8}, /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
272 {8, 8, 8}, /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
274 2, /* cost of moving MMX register */
275 {4, 8}, /* cost of loading MMX registers
   in SImode and DImode */
277 {4, 8}, /* cost of storing MMX registers
   in SImode and DImode */
279 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
280 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
282 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
284 3, 3, /* SSE->integer and integer->SSE moves */
285 3, 3, /* mask->integer and integer->mask moves */
286 {2, 4, 2}, /* cost of loading mask register
   in QImode, HImode, SImode. */
288 {2, 4, 2}, /* cost of storing mask register
   in QImode, HImode, SImode. */
290 2, /* cost of moving mask register. */
291 /* End of register allocator costs. */
294 COSTS_N_INSNS (1), /* cost of an add instruction */
295 COSTS_N_INSNS (1), /* cost of a lea instruction */
296 COSTS_N_INSNS (3), /* variable shift costs */
297 COSTS_N_INSNS (2), /* constant shift costs */
298 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
299 COSTS_N_INSNS (12), /* HI */
300 COSTS_N_INSNS (12), /* SI */
301 COSTS_N_INSNS (12), /* DI */
302 COSTS_N_INSNS (12)}, /* other */
303 1, /* cost of multiply per each bit set */
304 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
305 COSTS_N_INSNS (40), /* HI */
306 COSTS_N_INSNS (40), /* SI */
307 COSTS_N_INSNS (40), /* DI */
308 COSTS_N_INSNS (40)}, /* other */
309 COSTS_N_INSNS (3), /* cost of movsx */
310 COSTS_N_INSNS (2), /* cost of movzx */
311 15, /* "large" insn */
314 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
317 {2, 4, 2}, /* cost of storing integer registers */
318 {4, 8, 16, 32, 64}, /* cost of loading SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
320 {4, 8, 16, 32, 64}, /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
322 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
323 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
324 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
325 3, /* cost of moving SSE register to integer. */
326 4, 4, /* Gather load static, per_elt. */
327 4, 4, /* Gather store static, per_elt. */
328 4, /* size of l1 cache. 486 has 8kB cache
   shared for code and data, so 4kB is
   not really precise. */
331 4, /* size of l2 cache */
332 0, /* size of prefetch block */
333 0, /* number of parallel prefetches */
335 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
336 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
337 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
338 COSTS_N_INSNS (3), /* cost of FABS instruction. */
339 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
340 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
342 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
343 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
344 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
345 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
346 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
347 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
348 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
349 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
350 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
351 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
352 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
355 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
356 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
357 "16", /* Loop alignment. */
358 "16", /* Jump alignment. */
359 "0:0:8", /* Label alignment. */
360 "16", /* Func alignment. */
361 4, /* Small unroll limit. */
362 2, /* Small unroll factor. */
365 static stringop_algs pentium_memcpy
[2] = {
366 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
367 DUMMY_STRINGOP_ALGS
};
368 static stringop_algs pentium_memset
[2] = {
369 {libcall
, {{-1, rep_prefix_4_byte
, false}}},
370 DUMMY_STRINGOP_ALGS
};
/* Cost table for the Pentium (per its name).  Instruction costs are
   written with COSTS_N_INSNS.
   NOTE(review): every code line carries a leading number, those numbers
   skip (e.g. 423 -> 426 and 462 -> 465), and no terminating brace is
   visible before the next definition.  This suggests lines were lost in
   extraction.  The initializer is positional, so verify the entry order
   against the upstream file before relying on it.  */
373 struct processor_costs pentium_cost
= {
375 /* Start of register allocator costs. integer->integer move cost is 2. */
376 6, /* cost for loading QImode using movzbl */
377 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
380 {2, 4, 2}, /* cost of storing integer registers */
381 2, /* cost of reg,reg fld/fst */
382 {2, 2, 6}, /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
384 {4, 4, 6}, /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
386 8, /* cost of moving MMX register */
387 {8, 8}, /* cost of loading MMX registers
   in SImode and DImode */
389 {8, 8}, /* cost of storing MMX registers
   in SImode and DImode */
391 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
392 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
394 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
396 3, 3, /* SSE->integer and integer->SSE moves */
397 3, 3, /* mask->integer and integer->mask moves */
398 {2, 4, 2}, /* cost of loading mask register
   in QImode, HImode, SImode. */
400 {2, 4, 2}, /* cost of storing mask register
   in QImode, HImode, SImode. */
402 2, /* cost of moving mask register. */
403 /* End of register allocator costs. */
406 COSTS_N_INSNS (1), /* cost of an add instruction */
407 COSTS_N_INSNS (1), /* cost of a lea instruction */
408 COSTS_N_INSNS (4), /* variable shift costs */
409 COSTS_N_INSNS (1), /* constant shift costs */
410 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
411 COSTS_N_INSNS (11), /* HI */
412 COSTS_N_INSNS (11), /* SI */
413 COSTS_N_INSNS (11), /* DI */
414 COSTS_N_INSNS (11)}, /* other */
415 0, /* cost of multiply per each bit set */
416 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
417 COSTS_N_INSNS (25), /* HI */
418 COSTS_N_INSNS (25), /* SI */
419 COSTS_N_INSNS (25), /* DI */
420 COSTS_N_INSNS (25)}, /* other */
421 COSTS_N_INSNS (3), /* cost of movsx */
422 COSTS_N_INSNS (2), /* cost of movzx */
423 8, /* "large" insn */
426 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
429 {2, 4, 2}, /* cost of storing integer registers */
430 {4, 8, 16, 32, 64}, /* cost of loading SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
432 {4, 8, 16, 32, 64}, /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
434 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
435 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
436 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
437 3, /* cost of moving SSE register to integer. */
438 4, 4, /* Gather load static, per_elt. */
439 4, 4, /* Gather store static, per_elt. */
440 8, /* size of l1 cache. */
441 8, /* size of l2 cache */
442 0, /* size of prefetch block */
443 0, /* number of parallel prefetches */
445 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
446 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
447 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
448 COSTS_N_INSNS (1), /* cost of FABS instruction. */
449 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
450 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
452 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
453 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
454 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
455 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
456 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
457 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
458 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
459 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
460 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
461 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
462 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
465 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
466 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
467 "16:8:8", /* Loop alignment. */
468 "16:8:8", /* Jump alignment. */
469 "0:0:8", /* Label alignment. */
470 "16", /* Func alignment. */
471 4, /* Small unroll limit. */
472 2, /* Small unroll factor. */
/* Cost table for Lakemont (per its name).  Instruction costs are
   written with COSTS_N_INSNS; the lea entry is COSTS_N_INSNS (1) + 1,
   i.e. slightly more than an add.
   NOTE(review): every code line carries a leading number, those numbers
   skip (e.g. 526 -> 529 and 565 -> 568), and no terminating brace is
   visible before the next definition.  This suggests lines were lost in
   extraction.  The initializer is positional, so verify the entry order
   against the upstream file before relying on it.  */
476 struct processor_costs lakemont_cost
= {
478 /* Start of register allocator costs. integer->integer move cost is 2. */
479 6, /* cost for loading QImode using movzbl */
480 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
483 {2, 4, 2}, /* cost of storing integer registers */
484 2, /* cost of reg,reg fld/fst */
485 {2, 2, 6}, /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
487 {4, 4, 6}, /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
489 8, /* cost of moving MMX register */
490 {8, 8}, /* cost of loading MMX registers
   in SImode and DImode */
492 {8, 8}, /* cost of storing MMX registers
   in SImode and DImode */
494 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
495 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
497 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
499 3, 3, /* SSE->integer and integer->SSE moves */
500 3, 3, /* mask->integer and integer->mask moves */
501 {2, 4, 2}, /* cost of loading mask register
   in QImode, HImode, SImode. */
503 {2, 4, 2}, /* cost of storing mask register
   in QImode, HImode, SImode. */
505 2, /* cost of moving mask register. */
506 /* End of register allocator costs. */
509 COSTS_N_INSNS (1), /* cost of an add instruction */
510 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
511 COSTS_N_INSNS (1), /* variable shift costs */
512 COSTS_N_INSNS (1), /* constant shift costs */
513 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
514 COSTS_N_INSNS (11), /* HI */
515 COSTS_N_INSNS (11), /* SI */
516 COSTS_N_INSNS (11), /* DI */
517 COSTS_N_INSNS (11)}, /* other */
518 0, /* cost of multiply per each bit set */
519 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
520 COSTS_N_INSNS (25), /* HI */
521 COSTS_N_INSNS (25), /* SI */
522 COSTS_N_INSNS (25), /* DI */
523 COSTS_N_INSNS (25)}, /* other */
524 COSTS_N_INSNS (3), /* cost of movsx */
525 COSTS_N_INSNS (2), /* cost of movzx */
526 8, /* "large" insn */
529 {2, 4, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
532 {2, 4, 2}, /* cost of storing integer registers */
533 {4, 8, 16, 32, 64}, /* cost of loading SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
535 {4, 8, 16, 32, 64}, /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
537 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
538 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
539 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
540 3, /* cost of moving SSE register to integer. */
541 4, 4, /* Gather load static, per_elt. */
542 4, 4, /* Gather store static, per_elt. */
543 8, /* size of l1 cache. */
544 8, /* size of l2 cache */
545 0, /* size of prefetch block */
546 0, /* number of parallel prefetches */
548 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
549 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
550 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
551 COSTS_N_INSNS (1), /* cost of FABS instruction. */
552 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
553 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
555 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
556 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
557 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
558 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
559 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
560 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
561 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
562 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
563 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
564 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
565 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
568 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
569 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
570 "16:8:8", /* Loop alignment. */
571 "16:8:8", /* Jump alignment. */
572 "0:0:8", /* Label alignment. */
573 "16", /* Func alignment. */
574 4, /* Small unroll limit. */
575 2, /* Small unroll factor. */
578 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
579 (we ensure the alignment). For small blocks inline loop is still a
580 noticeable win, for bigger blocks either rep movsl or rep movsb is
581 way to go. Rep movsb has apparently more expensive startup time in CPU,
582 but after 4K the difference is down in the noise. */
583 static stringop_algs pentiumpro_memcpy
[2] = {
584 {rep_prefix_4_byte
, {{128, loop
, false}, {1024, unrolled_loop
, false},
585 {8192, rep_prefix_4_byte
, false},
586 {-1, rep_prefix_1_byte
, false}}},
587 DUMMY_STRINGOP_ALGS
};
588 static stringop_algs pentiumpro_memset
[2] = {
589 {rep_prefix_4_byte
, {{1024, unrolled_loop
, false},
590 {8192, rep_prefix_4_byte
, false},
591 {-1, libcall
, false}}},
592 DUMMY_STRINGOP_ALGS
};
/* Cost table for the PentiumPro (per its name).  Instruction costs are
   written with COSTS_N_INSNS.
   NOTE(review): every code line carries a leading number, those numbers
   skip (e.g. 644 -> 647 and 683 -> 686), and no terminating brace is
   visible before the next definition.  This suggests lines were lost in
   extraction.  The initializer is positional, so verify the entry order
   against the upstream file before relying on it.  */
594 struct processor_costs pentiumpro_cost
= {
596 /* Start of register allocator costs. integer->integer move cost is 2. */
597 2, /* cost for loading QImode using movzbl */
598 {4, 4, 4}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
601 {2, 2, 2}, /* cost of storing integer registers */
602 2, /* cost of reg,reg fld/fst */
603 {2, 2, 6}, /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
605 {4, 4, 6}, /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
607 2, /* cost of moving MMX register */
608 {2, 2}, /* cost of loading MMX registers
   in SImode and DImode */
610 {2, 2}, /* cost of storing MMX registers
   in SImode and DImode */
612 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
613 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
615 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
617 3, 3, /* SSE->integer and integer->SSE moves */
618 3, 3, /* mask->integer and integer->mask moves */
619 {4, 4, 4}, /* cost of loading mask register
   in QImode, HImode, SImode. */
621 {2, 2, 2}, /* cost of storing mask register
   in QImode, HImode, SImode. */
623 2, /* cost of moving mask register. */
624 /* End of register allocator costs. */
627 COSTS_N_INSNS (1), /* cost of an add instruction */
628 COSTS_N_INSNS (1), /* cost of a lea instruction */
629 COSTS_N_INSNS (1), /* variable shift costs */
630 COSTS_N_INSNS (1), /* constant shift costs */
631 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
632 COSTS_N_INSNS (4), /* HI */
633 COSTS_N_INSNS (4), /* SI */
634 COSTS_N_INSNS (4), /* DI */
635 COSTS_N_INSNS (4)}, /* other */
636 0, /* cost of multiply per each bit set */
637 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
638 COSTS_N_INSNS (17), /* HI */
639 COSTS_N_INSNS (17), /* SI */
640 COSTS_N_INSNS (17), /* DI */
641 COSTS_N_INSNS (17)}, /* other */
642 COSTS_N_INSNS (1), /* cost of movsx */
643 COSTS_N_INSNS (1), /* cost of movzx */
644 8, /* "large" insn */
647 {4, 4, 4}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
650 {2, 2, 2}, /* cost of storing integer registers */
651 {4, 8, 16, 32, 64}, /* cost of loading SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
653 {4, 8, 16, 32, 64}, /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
655 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
656 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
657 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
658 3, /* cost of moving SSE register to integer. */
659 4, 4, /* Gather load static, per_elt. */
660 4, 4, /* Gather store static, per_elt. */
661 8, /* size of l1 cache. */
662 256, /* size of l2 cache */
663 32, /* size of prefetch block */
664 6, /* number of parallel prefetches */
666 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
667 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
668 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
669 COSTS_N_INSNS (2), /* cost of FABS instruction. */
670 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
671 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
673 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
674 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
675 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
676 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
677 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
678 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
679 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
680 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
681 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
682 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
683 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
686 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
687 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
688 "16", /* Loop alignment. */
689 "16:11:8", /* Jump alignment. */
690 "0:0:8", /* Label alignment. */
691 "16", /* Func alignment. */
692 4, /* Small unroll limit. */
693 2, /* Small unroll factor. */
696 static stringop_algs geode_memcpy
[2] = {
697 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
698 DUMMY_STRINGOP_ALGS
};
699 static stringop_algs geode_memset
[2] = {
700 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
701 DUMMY_STRINGOP_ALGS
};
/* Cost table for Geode (per its name).  Instruction costs are written
   with COSTS_N_INSNS; the four alignment entries are NULL.
   NOTE(review): every code line carries a leading number, those numbers
   skip (e.g. 753 -> 756 and 792 -> 795), and no terminating brace is
   visible before the next definition.  This suggests lines were lost in
   extraction.  The initializer is positional, so verify the entry order
   against the upstream file before relying on it.  */
703 struct processor_costs geode_cost
= {
705 /* Start of register allocator costs. integer->integer move cost is 2. */
706 2, /* cost for loading QImode using movzbl */
707 {2, 2, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
710 {2, 2, 2}, /* cost of storing integer registers */
711 2, /* cost of reg,reg fld/fst */
712 {2, 2, 2}, /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
714 {4, 6, 6}, /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
716 2, /* cost of moving MMX register */
717 {2, 2}, /* cost of loading MMX registers
   in SImode and DImode */
719 {2, 2}, /* cost of storing MMX registers
   in SImode and DImode */
721 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
722 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
724 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
726 6, 6, /* SSE->integer and integer->SSE moves */
727 6, 6, /* mask->integer and integer->mask moves */
728 {2, 2, 2}, /* cost of loading mask register
   in QImode, HImode, SImode. */
730 {2, 2, 2}, /* cost of storing mask register
   in QImode, HImode, SImode. */
732 2, /* cost of moving mask register. */
733 /* End of register allocator costs. */
736 COSTS_N_INSNS (1), /* cost of an add instruction */
737 COSTS_N_INSNS (1), /* cost of a lea instruction */
738 COSTS_N_INSNS (2), /* variable shift costs */
739 COSTS_N_INSNS (1), /* constant shift costs */
740 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
741 COSTS_N_INSNS (4), /* HI */
742 COSTS_N_INSNS (7), /* SI */
743 COSTS_N_INSNS (7), /* DI */
744 COSTS_N_INSNS (7)}, /* other */
745 0, /* cost of multiply per each bit set */
746 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
747 COSTS_N_INSNS (23), /* HI */
748 COSTS_N_INSNS (39), /* SI */
749 COSTS_N_INSNS (39), /* DI */
750 COSTS_N_INSNS (39)}, /* other */
751 COSTS_N_INSNS (1), /* cost of movsx */
752 COSTS_N_INSNS (1), /* cost of movzx */
753 8, /* "large" insn */
756 {2, 2, 2}, /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2). */
759 {2, 2, 2}, /* cost of storing integer registers */
760 {2, 2, 8, 16, 32}, /* cost of loading SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
762 {2, 2, 8, 16, 32}, /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 512bit */
764 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
765 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
766 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
767 6, /* cost of moving SSE register to integer. */
768 2, 2, /* Gather load static, per_elt. */
769 2, 2, /* Gather store static, per_elt. */
770 64, /* size of l1 cache. */
771 128, /* size of l2 cache. */
772 32, /* size of prefetch block */
773 1, /* number of parallel prefetches */
775 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
776 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
777 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
778 COSTS_N_INSNS (1), /* cost of FABS instruction. */
779 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
780 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
782 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
783 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
784 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
785 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
786 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
787 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
788 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
789 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
790 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
791 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
792 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
795 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
796 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
797 NULL
, /* Loop alignment. */
798 NULL
, /* Jump alignment. */
799 NULL
, /* Label alignment. */
800 NULL
, /* Func alignment. */
801 4, /* Small unroll limit. */
802 2, /* Small unroll factor. */
/* memcpy expansion strategy table for K6 tuning.  The array has two
   entries; the first lists rep_prefix_4_byte for the 256 bound and libcall
   for the -1 catch-all, the second is the DUMMY_STRINGOP_ALGS placeholder
   (libcall everywhere).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (805, 806, ...) look like line-number
   residue rather than C tokens -- confirm against the upstream file.  */
805 static stringop_algs k6_memcpy
[2] = {
806 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
807 DUMMY_STRINGOP_ALGS
};
/* memset expansion strategy table for K6 tuning.  The array has two
   entries; the first lists rep_prefix_4_byte for the 256 bound and libcall
   for the -1 catch-all, the second is the DUMMY_STRINGOP_ALGS placeholder
   (libcall everywhere).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (808, 809, ...) look like line-number
   residue rather than C tokens -- confirm against the upstream file.  */
808 static stringop_algs k6_memset
[2] = {
809 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
810 DUMMY_STRINGOP_ALGS
};
/* Cost table used when tuning for K6.  This is a positional initializer
   of struct processor_costs: the meaning of each value is given by the
   comment on its line, and the order must match the struct declaration
   (not visible here).
   NOTE(review): the embedded line numbering has gaps (842 -> 845,
   862 -> 865, 885 -> 887, 904 -> 907) and no closing brace is visible
   before the next definition, so some initializer lines appear to be
   missing from this copy -- verify against the upstream file before
   relying on field positions.  */
812 struct processor_costs k6_cost
= {
814 /* Start of register allocator costs. integer->integer move cost is 2. */
815 3, /* cost for loading QImode using movzbl */
816 {4, 5, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 3, 2}, /* cost of storing integer registers */
820 4, /* cost of reg,reg fld/fst */
821 {6, 6, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 4}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
831 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
832 in 32,64,128,256 and 512-bit */
833 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
834 in 32,64,128,256 and 512-bit */
835 6, 6, /* SSE->integer and integer->SSE moves */
836 6, 6, /* mask->integer and integer->mask moves */
837 {4, 5, 4}, /* cost of loading mask register
838 in QImode, HImode, SImode. */
839 {2, 3, 2}, /* cost of storing mask register
840 in QImode, HImode, SImode. */
841 2, /* cost of moving mask register. */
842 /* End of register allocator costs. */
845 COSTS_N_INSNS (1), /* cost of an add instruction */
846 COSTS_N_INSNS (2), /* cost of a lea instruction */
847 COSTS_N_INSNS (1), /* variable shift costs */
848 COSTS_N_INSNS (1), /* constant shift costs */
849 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
850 COSTS_N_INSNS (3), /* HI */
851 COSTS_N_INSNS (3), /* SI */
852 COSTS_N_INSNS (3), /* DI */
853 COSTS_N_INSNS (3)}, /* other */
854 0, /* cost of multiply per each bit set */
855 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
856 COSTS_N_INSNS (18), /* HI */
857 COSTS_N_INSNS (18), /* SI */
858 COSTS_N_INSNS (18), /* DI */
859 COSTS_N_INSNS (18)}, /* other */
860 COSTS_N_INSNS (2), /* cost of movsx */
861 COSTS_N_INSNS (2), /* cost of movzx */
862 8, /* "large" insn */
865 {4, 5, 4}, /* cost of loading integer registers
866 in QImode, HImode and SImode.
867 Relative to reg-reg move (2). */
868 {2, 3, 2}, /* cost of storing integer registers */
869 {2, 2, 8, 16, 32}, /* cost of loading SSE register
870 in 32bit, 64bit, 128bit, 256bit and 512bit */
871 {2, 2, 8, 16, 32}, /* cost of storing SSE register
872 in 32bit, 64bit, 128bit, 256bit and 512bit */
873 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
874 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
875 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
876 6, /* cost of moving SSE register to integer. */
877 2, 2, /* Gather load static, per_elt. */
878 2, 2, /* Gather store static, per_elt. */
879 32, /* size of l1 cache. */
880 32, /* size of l2 cache. Some models
881 have integrated l2 cache, but
882 optimizing for k6 is not important
883 enough to worry about that. */
884 32, /* size of prefetch block */
885 1, /* number of parallel prefetches */
887 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
888 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
889 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
890 COSTS_N_INSNS (2), /* cost of FABS instruction. */
891 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
892 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
894 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
895 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
896 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
897 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
898 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
899 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
900 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
901 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
902 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
903 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
904 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
907 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
908 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
909 "32:8:8", /* Loop alignment. */
910 "32:8:8", /* Jump alignment. */
911 "0:0:8", /* Label alignment. */
912 "32", /* Func alignment. */
913 4, /* Small unroll limit. */
914 2, /* Small unroll factor. */
917 /* For some reason, Athlon deals better with REP prefix (relative to loops)
918 compared to K8. Alignment becomes important after 8 bytes for memcpy and
919 128 bytes for memset. */
/* memcpy expansion strategy table for Athlon tuning.  The array has two
   entries; the first lists rep_prefix_4_byte for the 2048 bound and
   libcall for the -1 catch-all, the second is the DUMMY_STRINGOP_ALGS
   placeholder (libcall everywhere).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (920, 921, ...) look like line-number
   residue rather than C tokens -- confirm against the upstream file.  */
920 static stringop_algs athlon_memcpy
[2] = {
921 {libcall
, {{2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
922 DUMMY_STRINGOP_ALGS
};
/* memset expansion strategy table for Athlon tuning.  The array has two
   entries; the first lists rep_prefix_4_byte for the 2048 bound and
   libcall for the -1 catch-all, the second is the DUMMY_STRINGOP_ALGS
   placeholder (libcall everywhere).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (923, 924, ...) look like line-number
   residue rather than C tokens -- confirm against the upstream file.  */
923 static stringop_algs athlon_memset
[2] = {
924 {libcall
, {{2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
925 DUMMY_STRINGOP_ALGS
};
/* Cost table used when tuning for Athlon.  This is a positional
   initializer of struct processor_costs: the meaning of each value is
   given by the comment on its line, and the order must match the struct
   declaration (not visible here).
   NOTE(review): the embedded line numbering has gaps (957 -> 960,
   977 -> 980, 997 -> 999, 1017 -> 1020) and no closing brace is visible
   before the next definition, so some initializer lines appear to be
   missing from this copy -- verify against the upstream file before
   relying on field positions.  */
927 struct processor_costs athlon_cost
= {
929 /* Start of register allocator costs. integer->integer move cost is 2. */
930 4, /* cost for loading QImode using movzbl */
931 {3, 4, 3}, /* cost of loading integer registers
932 in QImode, HImode and SImode.
933 Relative to reg-reg move (2). */
934 {3, 4, 3}, /* cost of storing integer registers */
935 4, /* cost of reg,reg fld/fst */
936 {4, 4, 12}, /* cost of loading fp registers
937 in SFmode, DFmode and XFmode */
938 {6, 6, 8}, /* cost of storing fp registers
939 in SFmode, DFmode and XFmode */
940 2, /* cost of moving MMX register */
941 {4, 4}, /* cost of loading MMX registers
942 in SImode and DImode */
943 {4, 4}, /* cost of storing MMX registers
944 in SImode and DImode */
945 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
946 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
947 in 32,64,128,256 and 512-bit */
948 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
949 in 32,64,128,256 and 512-bit */
950 5, 5, /* SSE->integer and integer->SSE moves */
951 5, 5, /* mask->integer and integer->mask moves */
952 {3, 4, 3}, /* cost of loading mask register
953 in QImode, HImode, SImode. */
954 {3, 4, 3}, /* cost of storing mask register
955 in QImode, HImode, SImode. */
956 2, /* cost of moving mask register. */
957 /* End of register allocator costs. */
960 COSTS_N_INSNS (1), /* cost of an add instruction */
961 COSTS_N_INSNS (2), /* cost of a lea instruction */
962 COSTS_N_INSNS (1), /* variable shift costs */
963 COSTS_N_INSNS (1), /* constant shift costs */
964 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
965 COSTS_N_INSNS (5), /* HI */
966 COSTS_N_INSNS (5), /* SI */
967 COSTS_N_INSNS (5), /* DI */
968 COSTS_N_INSNS (5)}, /* other */
969 0, /* cost of multiply per each bit set */
970 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
971 COSTS_N_INSNS (26), /* HI */
972 COSTS_N_INSNS (42), /* SI */
973 COSTS_N_INSNS (74), /* DI */
974 COSTS_N_INSNS (74)}, /* other */
975 COSTS_N_INSNS (1), /* cost of movsx */
976 COSTS_N_INSNS (1), /* cost of movzx */
977 8, /* "large" insn */
980 {3, 4, 3}, /* cost of loading integer registers
981 in QImode, HImode and SImode.
982 Relative to reg-reg move (2). */
983 {3, 4, 3}, /* cost of storing integer registers */
984 {4, 4, 12, 12, 24}, /* cost of loading SSE register
985 in 32bit, 64bit, 128bit, 256bit and 512bit */
986 {4, 4, 10, 10, 20}, /* cost of storing SSE register
987 in 32bit, 64bit, 128bit, 256bit and 512bit */
988 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
989 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
990 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
991 5, /* cost of moving SSE register to integer. */
992 4, 4, /* Gather load static, per_elt. */
993 4, 4, /* Gather store static, per_elt. */
994 64, /* size of l1 cache. */
995 256, /* size of l2 cache. */
996 64, /* size of prefetch block */
997 6, /* number of parallel prefetches */
999 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1000 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1001 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1002 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1003 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1004 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1006 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1007 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1008 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1009 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1010 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1011 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1013 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1014 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
1015 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1016 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
1017 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1020 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1021 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1022 "16:8:8", /* Loop alignment. */
1023 "16:8:8", /* Jump alignment. */
1024 "0:0:8", /* Label alignment. */
1025 "16", /* Func alignment. */
1026 4, /* Small unroll limit. */
1027 2, /* Small unroll factor. */
1030 /* K8 has optimized REP instruction for medium sized blocks, but for very
1031 small blocks it is better to use loop. For large blocks, libcall can
1032 do non-temporal accesses and beat inline considerably. */
/* memcpy expansion strategy table for K8 tuning.  Two entries: the first
   steps through loop (6), unrolled_loop (14) and rep_prefix_4_byte (-1
   catch-all); the second through loop (16), rep_prefix_8_byte (8192) and
   libcall (-1 catch-all).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (1033, 1034, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1033 static stringop_algs k8_memcpy
[2] = {
1034 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1035 {-1, rep_prefix_4_byte
, false}}},
1036 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1037 {-1, libcall
, false}}}};
/* memset expansion strategy table for K8 tuning.  Two entries: the first
   steps through loop (8), unrolled_loop (24), rep_prefix_4_byte (2048)
   and libcall (-1 catch-all); the second through unrolled_loop (48),
   rep_prefix_8_byte (8192) and libcall (-1 catch-all).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (1038, 1039, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1038 static stringop_algs k8_memset
[2] = {
1039 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1040 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1041 {libcall
, {{48, unrolled_loop
, false},
1042 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Cost table used when tuning for K8.  This is a positional initializer
   of struct processor_costs: the meaning of each value is given by the
   comment on its line, and the order must match the struct declaration
   (not visible here).
   NOTE(review): the embedded line numbering has gaps (1074 -> 1077,
   1094 -> 1096, 1117 -> 1119, 1139 -> 1142) and no closing brace is
   visible before the next definition, so some lines appear to be missing
   from this copy.  In particular the "New AMD processors" comment has no
   terminator of its own before the "100," prefetch line, which as written
   places that value inside the comment -- verify against the upstream
   file.  */
1044 struct processor_costs k8_cost
= {
1046 /* Start of register allocator costs. integer->integer move cost is 2. */
1047 4, /* cost for loading QImode using movzbl */
1048 {3, 4, 3}, /* cost of loading integer registers
1049 in QImode, HImode and SImode.
1050 Relative to reg-reg move (2). */
1051 {3, 4, 3}, /* cost of storing integer registers */
1052 4, /* cost of reg,reg fld/fst */
1053 {4, 4, 12}, /* cost of loading fp registers
1054 in SFmode, DFmode and XFmode */
1055 {6, 6, 8}, /* cost of storing fp registers
1056 in SFmode, DFmode and XFmode */
1057 2, /* cost of moving MMX register */
1058 {3, 3}, /* cost of loading MMX registers
1059 in SImode and DImode */
1060 {4, 4}, /* cost of storing MMX registers
1061 in SImode and DImode */
1062 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1063 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
1064 in 32,64,128,256 and 512-bit */
1065 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
1066 in 32,64,128,256 and 512-bit */
1067 5, 5, /* SSE->integer and integer->SSE moves */
1068 5, 5, /* mask->integer and integer->mask moves */
1069 {3, 4, 3}, /* cost of loading mask register
1070 in QImode, HImode, SImode. */
1071 {3, 4, 3}, /* cost of storing mask register
1072 in QImode, HImode, SImode. */
1073 2, /* cost of moving mask register. */
1074 /* End of register allocator costs. */
1077 COSTS_N_INSNS (1), /* cost of an add instruction */
1078 COSTS_N_INSNS (2), /* cost of a lea instruction */
1079 COSTS_N_INSNS (1), /* variable shift costs */
1080 COSTS_N_INSNS (1), /* constant shift costs */
1081 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1082 COSTS_N_INSNS (4), /* HI */
1083 COSTS_N_INSNS (3), /* SI */
1084 COSTS_N_INSNS (4), /* DI */
1085 COSTS_N_INSNS (5)}, /* other */
1086 0, /* cost of multiply per each bit set */
1087 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1088 COSTS_N_INSNS (26), /* HI */
1089 COSTS_N_INSNS (42), /* SI */
1090 COSTS_N_INSNS (74), /* DI */
1091 COSTS_N_INSNS (74)}, /* other */
1092 COSTS_N_INSNS (1), /* cost of movsx */
1093 COSTS_N_INSNS (1), /* cost of movzx */
1094 8, /* "large" insn */
1096 6, /* CLEAR_RATIO */
1097 {3, 4, 3}, /* cost of loading integer registers
1098 in QImode, HImode and SImode.
1099 Relative to reg-reg move (2). */
1100 {3, 4, 3}, /* cost of storing integer registers */
1101 {4, 3, 12, 12, 24}, /* cost of loading SSE register
1102 in 32bit, 64bit, 128bit, 256bit and 512bit */
1103 {4, 4, 10, 10, 20}, /* cost of storing SSE register
1104 in 32bit, 64bit, 128bit, 256bit and 512bit */
1105 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
1106 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
1107 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1108 5, /* cost of moving SSE register to integer. */
1109 4, 4, /* Gather load static, per_elt. */
1110 4, 4, /* Gather store static, per_elt. */
1111 64, /* size of l1 cache. */
1112 512, /* size of l2 cache. */
1113 64, /* size of prefetch block */
1114 /* New AMD processors never drop prefetches; if they cannot be performed
1115 immediately, they are queued. We set number of simultaneous prefetches
1116 to a large constant to reflect this (it probably is not a good idea not
1117 to limit number of prefetches at all, as their execution also takes some
1119 100, /* number of parallel prefetches */
1120 3, /* Branch cost */
1121 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1122 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1123 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1124 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1125 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1126 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1128 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1129 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1130 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1131 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1132 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1133 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1135 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1136 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1137 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1138 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1139 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1142 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1143 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1144 "16:8:8", /* Loop alignment. */
1145 "16:8:8", /* Jump alignment. */
1146 "0:0:8", /* Label alignment. */
1147 "16", /* Func alignment. */
1148 4, /* Small unroll limit. */
1149 2, /* Small unroll factor. */
1152 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1153 very small blocks it is better to use loop. For large blocks, libcall can
1154 do non-temporal accesses and beat inline considerably. */
/* memcpy expansion strategy table for AMDFAM10 tuning.  Two entries: the
   first steps through loop (6), unrolled_loop (14) and rep_prefix_4_byte
   (-1 catch-all); the second through loop (16), rep_prefix_8_byte (8192)
   and libcall (-1 catch-all).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (1155, 1156, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1155 static stringop_algs amdfam10_memcpy
[2] = {
1156 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1157 {-1, rep_prefix_4_byte
, false}}},
1158 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1159 {-1, libcall
, false}}}};
/* memset expansion strategy table for AMDFAM10 tuning.  Two entries: the
   first steps through loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1 catch-all); the second through unrolled_loop
   (48), rep_prefix_8_byte (8192) and libcall (-1 catch-all).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (1160, 1161, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1160 static stringop_algs amdfam10_memset
[2] = {
1161 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1162 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1163 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
1164 {-1, libcall
, false}}}};
/* Cost table used when tuning for AMDFAM10.  This is a positional
   initializer of struct processor_costs: the meaning of each value is
   given by the comment on its line, and the order must match the struct
   declaration (not visible here).
   NOTE(review): the embedded line numbering has gaps (1194 -> 1197,
   1224 -> 1226, 1247 -> 1249, 1269 -> 1272) and no closing brace is
   visible before the next definition, so some lines appear to be missing
   from this copy.  The four "MOVD reg, xmmreg" lines sit outside any
   comment as written (their opening comment marker appears to be lost),
   and the "New AMD processors" comment has no terminator of its own
   before the "100," prefetch line -- verify against the upstream file.  */
1165 struct processor_costs amdfam10_cost
= {
1167 /* Start of register allocator costs. integer->integer move cost is 2. */
1168 4, /* cost for loading QImode using movzbl */
1169 {3, 4, 3}, /* cost of loading integer registers
1170 in QImode, HImode and SImode.
1171 Relative to reg-reg move (2). */
1172 {3, 4, 3}, /* cost of storing integer registers */
1173 4, /* cost of reg,reg fld/fst */
1174 {4, 4, 12}, /* cost of loading fp registers
1175 in SFmode, DFmode and XFmode */
1176 {6, 6, 8}, /* cost of storing fp registers
1177 in SFmode, DFmode and XFmode */
1178 2, /* cost of moving MMX register */
1179 {3, 3}, /* cost of loading MMX registers
1180 in SImode and DImode */
1181 {4, 4}, /* cost of storing MMX registers
1182 in SImode and DImode */
1183 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1184 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1185 in 32,64,128,256 and 512-bit */
1186 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1187 in 32,64,128,256 and 512-bit */
1188 3, 3, /* SSE->integer and integer->SSE moves */
1189 3, 3, /* mask->integer and integer->mask moves */
1190 {3, 4, 3}, /* cost of loading mask register
1191 in QImode, HImode, SImode. */
1192 {3, 4, 3}, /* cost of storing mask register
1193 in QImode, HImode, SImode. */
1194 2, /* cost of moving mask register. */
1197 MOVD reg64, xmmreg Double FSTORE 4
1198 MOVD reg32, xmmreg Double FSTORE 4
1200 MOVD reg64, xmmreg Double FADD 3
1202 MOVD reg32, xmmreg Double FADD 3
1204 /* End of register allocator costs. */
1207 COSTS_N_INSNS (1), /* cost of an add instruction */
1208 COSTS_N_INSNS (2), /* cost of a lea instruction */
1209 COSTS_N_INSNS (1), /* variable shift costs */
1210 COSTS_N_INSNS (1), /* constant shift costs */
1211 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1212 COSTS_N_INSNS (4), /* HI */
1213 COSTS_N_INSNS (3), /* SI */
1214 COSTS_N_INSNS (4), /* DI */
1215 COSTS_N_INSNS (5)}, /* other */
1216 0, /* cost of multiply per each bit set */
1217 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1218 COSTS_N_INSNS (35), /* HI */
1219 COSTS_N_INSNS (51), /* SI */
1220 COSTS_N_INSNS (83), /* DI */
1221 COSTS_N_INSNS (83)}, /* other */
1222 COSTS_N_INSNS (1), /* cost of movsx */
1223 COSTS_N_INSNS (1), /* cost of movzx */
1224 8, /* "large" insn */
1226 6, /* CLEAR_RATIO */
1227 {3, 4, 3}, /* cost of loading integer registers
1228 in QImode, HImode and SImode.
1229 Relative to reg-reg move (2). */
1230 {3, 4, 3}, /* cost of storing integer registers */
1231 {4, 4, 3, 6, 12}, /* cost of loading SSE register
1232 in 32bit, 64bit, 128bit, 256bit and 512bit */
1233 {4, 4, 5, 10, 20}, /* cost of storing SSE register
1234 in 32bit, 64bit, 128bit, 256bit and 512bit */
1235 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
1236 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1237 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1238 3, /* cost of moving SSE register to integer. */
1239 4, 4, /* Gather load static, per_elt. */
1240 4, 4, /* Gather store static, per_elt. */
1241 64, /* size of l1 cache. */
1242 512, /* size of l2 cache. */
1243 64, /* size of prefetch block */
1244 /* New AMD processors never drop prefetches; if they cannot be performed
1245 immediately, they are queued. We set number of simultaneous prefetches
1246 to a large constant to reflect this (it probably is not a good idea not
1247 to limit number of prefetches at all, as their execution also takes some
1249 100, /* number of parallel prefetches */
1250 2, /* Branch cost */
1251 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1252 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1253 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1254 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1255 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1256 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1258 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1259 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1260 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1261 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1262 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1263 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1265 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1266 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1267 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1268 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1269 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1272 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1273 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1274 "32:25:8", /* Loop alignment. */
1275 "32:8:8", /* Jump alignment. */
1276 "0:0:8", /* Label alignment. */
1277 "32", /* Func alignment. */
1278 4, /* Small unroll limit. */
1279 2, /* Small unroll factor. */
1282 /* BDVER has optimized REP instruction for medium sized blocks, but for
1283 very small blocks it is better to use loop. For large blocks, libcall
1284 can do non-temporal accesses and beat inline considerably. */
/* memcpy expansion strategy table for BDVER tuning.  Two entries: the
   first steps through loop (6), unrolled_loop (14) and rep_prefix_4_byte
   (-1 catch-all); the second through loop (16), rep_prefix_8_byte (8192)
   and libcall (-1 catch-all).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (1285, 1286, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1285 static stringop_algs bdver_memcpy
[2] = {
1286 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1287 {-1, rep_prefix_4_byte
, false}}},
1288 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1289 {-1, libcall
, false}}}};
/* memset expansion strategy table for BDVER tuning.  Two entries: the
   first steps through loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1 catch-all); the second through unrolled_loop
   (48), rep_prefix_8_byte (8192) and libcall (-1 catch-all).
   NOTE(review): presumably entry 0 is the 32-bit tuning and entry 1 the
   64-bit tuning, as labelled in the znver1 tables -- confirm.
   NOTE(review): the leading numbers (1290, 1291, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1290 static stringop_algs bdver_memset
[2] = {
1291 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1292 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1293 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
1294 {-1, libcall
, false}}}};
/* Cost table used when tuning for BDVER.  This is a positional
   initializer of struct processor_costs: the meaning of each value is
   given by the comment on its line, and the order must match the struct
   declaration (not visible here).
   NOTE(review): the embedded line numbering has gaps (1326 -> 1329,
   1346 -> 1348, 1369 -> 1371, 1392 -> 1395) and no closing brace is
   visible before the next definition, so some lines appear to be missing
   from this copy.  In particular the "New AMD processors" comment has no
   terminator of its own before the "100," prefetch line, which as written
   places that value inside the comment -- verify against the upstream
   file.  */
1296 const struct processor_costs bdver_cost
= {
1298 /* Start of register allocator costs. integer->integer move cost is 2. */
1299 8, /* cost for loading QImode using movzbl */
1300 {8, 8, 8}, /* cost of loading integer registers
1301 in QImode, HImode and SImode.
1302 Relative to reg-reg move (2). */
1303 {8, 8, 8}, /* cost of storing integer registers */
1304 4, /* cost of reg,reg fld/fst */
1305 {12, 12, 28}, /* cost of loading fp registers
1306 in SFmode, DFmode and XFmode */
1307 {10, 10, 18}, /* cost of storing fp registers
1308 in SFmode, DFmode and XFmode */
1309 4, /* cost of moving MMX register */
1310 {12, 12}, /* cost of loading MMX registers
1311 in SImode and DImode */
1312 {10, 10}, /* cost of storing MMX registers
1313 in SImode and DImode */
1314 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1315 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1316 in 32,64,128,256 and 512-bit */
1317 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1318 in 32,64,128,256 and 512-bit */
1319 16, 20, /* SSE->integer and integer->SSE moves */
1320 16, 20, /* mask->integer and integer->mask moves */
1321 {8, 8, 8}, /* cost of loading mask register
1322 in QImode, HImode, SImode. */
1323 {8, 8, 8}, /* cost of storing mask register
1324 in QImode, HImode, SImode. */
1325 2, /* cost of moving mask register. */
1326 /* End of register allocator costs. */
1329 COSTS_N_INSNS (1), /* cost of an add instruction */
1330 COSTS_N_INSNS (1), /* cost of a lea instruction */
1331 COSTS_N_INSNS (1), /* variable shift costs */
1332 COSTS_N_INSNS (1), /* constant shift costs */
1333 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1334 COSTS_N_INSNS (4), /* HI */
1335 COSTS_N_INSNS (4), /* SI */
1336 COSTS_N_INSNS (6), /* DI */
1337 COSTS_N_INSNS (6)}, /* other */
1338 0, /* cost of multiply per each bit set */
1339 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1340 COSTS_N_INSNS (35), /* HI */
1341 COSTS_N_INSNS (51), /* SI */
1342 COSTS_N_INSNS (83), /* DI */
1343 COSTS_N_INSNS (83)}, /* other */
1344 COSTS_N_INSNS (1), /* cost of movsx */
1345 COSTS_N_INSNS (1), /* cost of movzx */
1346 8, /* "large" insn */
1348 6, /* CLEAR_RATIO */
1349 {8, 8, 8}, /* cost of loading integer registers
1350 in QImode, HImode and SImode.
1351 Relative to reg-reg move (2). */
1352 {8, 8, 8}, /* cost of storing integer registers */
1353 {12, 12, 10, 40, 60}, /* cost of loading SSE register
1354 in 32bit, 64bit, 128bit, 256bit and 512bit */
1355 {10, 10, 10, 40, 60}, /* cost of storing SSE register
1356 in 32bit, 64bit, 128bit, 256bit and 512bit */
1357 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1358 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1359 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1360 16, /* cost of moving SSE register to integer. */
1361 12, 12, /* Gather load static, per_elt. */
1362 10, 10, /* Gather store static, per_elt. */
1363 16, /* size of l1 cache. */
1364 2048, /* size of l2 cache. */
1365 64, /* size of prefetch block */
1366 /* New AMD processors never drop prefetches; if they cannot be performed
1367 immediately, they are queued. We set number of simultaneous prefetches
1368 to a large constant to reflect this (it probably is not a good idea not
1369 to limit number of prefetches at all, as their execution also takes some
1371 100, /* number of parallel prefetches */
1372 2, /* Branch cost */
1373 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1374 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1375 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1376 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1377 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1378 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1380 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1381 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1382 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1383 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1384 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1385 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1387 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1389 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1390 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1391 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1392 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1395 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1396 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1397 "16:11:8", /* Loop alignment. */
1398 "16:8:8", /* Jump alignment. */
1399 "0:0:8", /* Label alignment. */
1400 "11", /* Func alignment. */
1401 4, /* Small unroll limit. */
1402 2, /* Small unroll factor. */
1406 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1407 very small blocks it is better to use loop. For large blocks, libcall
1408 can do non-temporal accesses and beat inline considerably. */
/* memcpy expansion strategy table for ZNVER1 tuning.  Entry 0 (32-bit
   tuning) steps through loop (6), unrolled_loop (14) and libcall (-1
   catch-all); entry 1 (64-bit tuning) through loop (16),
   rep_prefix_8_byte (128) and libcall (-1 catch-all).
   NOTE(review): the leading numbers (1409, 1410, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1409 static stringop_algs znver1_memcpy
[2] = {
1410 /* 32-bit tuning. */
1411 {libcall
, {{6, loop
, false},
1412 {14, unrolled_loop
, false},
1413 {-1, libcall
, false}}},
1414 /* 64-bit tuning. */
1415 {libcall
, {{16, loop
, false},
1416 {128, rep_prefix_8_byte
, false},
1417 {-1, libcall
, false}}}};
/* memset expansion strategy table for ZNVER1 tuning.  Entry 0 (32-bit
   tuning) steps through loop (8), unrolled_loop (24), rep_prefix_4_byte
   (128) and libcall (-1 catch-all); entry 1 (64-bit tuning) through
   unrolled_loop (48), rep_prefix_8_byte (128) and libcall (-1 catch-all).
   NOTE(review): the leading numbers (1418, 1419, ...) look like
   line-number residue rather than C tokens -- confirm against upstream.  */
1418 static stringop_algs znver1_memset
[2] = {
1419 /* 32-bit tuning. */
1420 {libcall
, {{8, loop
, false},
1421 {24, unrolled_loop
, false},
1422 {128, rep_prefix_4_byte
, false},
1423 {-1, libcall
, false}}},
1424 /* 64-bit tuning. */
1425 {libcall
, {{48, unrolled_loop
, false},
1426 {128, rep_prefix_8_byte
, false},
1427 {-1, libcall
, false}}}};
1428 struct processor_costs znver1_cost
= {
1430 /* Start of register allocator costs. integer->integer move cost is 2. */
1432 /* reg-reg moves are done by renaming and thus they are even cheaper than
1433 1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
1434 to doubles of latencies, we do not model this correctly. It does not
1435 seem to make practical difference to bump prices up even more. */
1436 6, /* cost for loading QImode using
1438 {6, 6, 6}, /* cost of loading integer registers
1439 in QImode, HImode and SImode.
1440 Relative to reg-reg move (2). */
1441 {8, 8, 8}, /* cost of storing integer
1443 2, /* cost of reg,reg fld/fst. */
1444 {6, 6, 16}, /* cost of loading fp registers
1445 in SFmode, DFmode and XFmode. */
1446 {8, 8, 16}, /* cost of storing fp registers
1447 in SFmode, DFmode and XFmode. */
1448 2, /* cost of moving MMX register. */
1449 {6, 6}, /* cost of loading MMX registers
1450 in SImode and DImode. */
1451 {8, 8}, /* cost of storing MMX registers
1452 in SImode and DImode. */
1453 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1454 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1455 in 32,64,128,256 and 512-bit. */
1456 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1457 in 32,64,128,256 and 512-bit. */
1458 6, 6, /* SSE->integer and integer->SSE moves. */
1459 8, 8, /* mask->integer and integer->mask moves */
1460 {6, 6, 6}, /* cost of loading mask register
1461 in QImode, HImode, SImode. */
1462 {8, 8, 8}, /* cost if storing mask register
1463 in QImode, HImode, SImode. */
1464 2, /* cost of moving mask register. */
1465 /* End of register allocator costs. */
1468 COSTS_N_INSNS (1), /* cost of an add instruction. */
1469 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1470 COSTS_N_INSNS (1), /* variable shift costs. */
1471 COSTS_N_INSNS (1), /* constant shift costs. */
1472 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1473 COSTS_N_INSNS (3), /* HI. */
1474 COSTS_N_INSNS (3), /* SI. */
1475 COSTS_N_INSNS (3), /* DI. */
1476 COSTS_N_INSNS (3)}, /* other. */
1477 0, /* cost of multiply per each bit
1479 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1481 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1482 COSTS_N_INSNS (22), /* HI. */
1483 COSTS_N_INSNS (30), /* SI. */
1484 COSTS_N_INSNS (45), /* DI. */
1485 COSTS_N_INSNS (45)}, /* other. */
1486 COSTS_N_INSNS (1), /* cost of movsx. */
1487 COSTS_N_INSNS (1), /* cost of movzx. */
1488 8, /* "large" insn. */
1489 9, /* MOVE_RATIO. */
1490 6, /* CLEAR_RATIO */
1491 {6, 6, 6}, /* cost of loading integer registers
1492 in QImode, HImode and SImode.
1493 Relative to reg-reg move (2). */
1494 {8, 8, 8}, /* cost of storing integer
1496 {6, 6, 6, 12, 24}, /* cost of loading SSE register
1497 in 32bit, 64bit, 128bit, 256bit and 512bit */
1498 {8, 8, 8, 16, 32}, /* cost of storing SSE register
1499 in 32bit, 64bit, 128bit, 256bit and 512bit */
1500 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1501 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1502 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1503 6, /* cost of moving SSE register to integer. */
1504 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1505 throughput 12. Approx 9 uops do not depend on vector size and every load
1507 18, 8, /* Gather load static, per_elt. */
1508 18, 10, /* Gather store static, per_elt. */
1509 32, /* size of l1 cache. */
1510 512, /* size of l2 cache. */
1511 64, /* size of prefetch block. */
1512 /* New AMD processors never drop prefetches; if they cannot be performed
1513 immediately, they are queued. We set number of simultaneous prefetches
1514 to a large constant to reflect this (it probably is not a good idea not
1515 to limit number of prefetches at all, as their execution also takes some
1517 100, /* number of parallel prefetches. */
1518 3, /* Branch cost. */
1519 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1520 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1521 /* Latency of fdiv is 8-15. */
1522 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1523 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1524 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1525 /* Latency of fsqrt is 4-10. */
1526 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1530 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1532 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1533 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1534 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1536 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1537 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1538 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1539 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1540 and it can execute 2 integer additions and 2 multiplications thus
1541 reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest
1542 that 4 works better than 6 probably due to register pressure.
1544 Integer vector operations are taken by FP unit and execute 3 vector
1545 plus/minus operations per cycle but only one multiply. This is adjusted
1546 in ix86_reassociation_width. */
1547 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1550 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1551 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1552 "16", /* Loop alignment. */
1553 "16", /* Jump alignment. */
1554 "0:0:8", /* Label alignment. */
1555 "16", /* Func alignment. */
1556 4, /* Small unroll limit. */
1557 2, /* Small unroll factor. */
1560 /* ZNVER2 has optimized REP instruction for medium sized blocks, but for
1561 very small blocks it is better to use loop. For large blocks, libcall
1562 can do non-temporal accesses and beat inline considerably.

   The table has two entries, labelled below as the 32-bit tuning followed
   by the 64-bit tuning.  Within an entry each inner triple appears to pair
   a block-size bound with the algorithm to use, the row with -1 being the
   final catch-all.  NOTE(review): the exact meaning of the leading
   algorithm and of the trailing boolean comes from the stringop_algs
   declaration, which is not visible here -- confirm there.  */
1563 static stringop_algs znver2_memcpy
[2] = {
1564 /* 32-bit tuning. */
1565 {libcall
, {{6, loop
, false},
1566 {14, unrolled_loop
, false},
1567 {-1, libcall
, false}}},
1568 /* 64-bit tuning. */
1569 {libcall
, {{16, loop
, false},
1570 {64, rep_prefix_4_byte
, false},
1571 {-1, libcall
, false}}}};
/* memset counterpart of the ZNVER2 memcpy table.  Entry 0 is labelled as
   the 32-bit tuning and entry 1 as the 64-bit tuning.  The 32-bit entry
   lists loop, unrolled_loop and rep_prefix_4_byte rows before the libcall
   row; the 64-bit entry lists only rep_prefix_4_byte and rep_prefix_8_byte
   rows before it.  NOTE(review): the numbers look like block-size bounds
   with -1 as the catch-all row, and the meaning of the trailing boolean
   is defined by the stringop_algs declaration, which is not visible
   here -- confirm there.  */
1572 static stringop_algs znver2_memset
[2] = {
1573 /* 32-bit tuning. */
1574 {libcall
, {{8, loop
, false},
1575 {24, unrolled_loop
, false},
1576 {128, rep_prefix_4_byte
, false},
1577 {-1, libcall
, false}}},
1578 /* 64-bit tuning. */
1579 {libcall
, {{24, rep_prefix_4_byte
, false},
1580 {128, rep_prefix_8_byte
, false},
1581 {-1, libcall
, false}}}};
1583 struct processor_costs znver2_cost
= {
1585 /* Start of register allocator costs. integer->integer move cost is 2. */
1587 /* reg-reg moves are done by renaming and thus they are even cheaper than
1588 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1589 to doubles of latencies, we do not model this correctly. It does not
1590 seem to make practical difference to bump prices up even more. */
1591 6, /* cost for loading QImode using
1593 {6, 6, 6}, /* cost of loading integer registers
1594 in QImode, HImode and SImode.
1595 Relative to reg-reg move (2). */
1596 {8, 8, 8}, /* cost of storing integer
1598 2, /* cost of reg,reg fld/fst. */
1599 {6, 6, 16}, /* cost of loading fp registers
1600 in SFmode, DFmode and XFmode. */
1601 {8, 8, 16}, /* cost of storing fp registers
1602 in SFmode, DFmode and XFmode. */
1603 2, /* cost of moving MMX register. */
1604 {6, 6}, /* cost of loading MMX registers
1605 in SImode and DImode. */
1606 {8, 8}, /* cost of storing MMX registers
1607 in SImode and DImode. */
1608 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1610 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1611 in 32,64,128,256 and 512-bit. */
1612 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1613 in 32,64,128,256 and 512-bit. */
1614 6, 6, /* SSE->integer and integer->SSE
1616 8, 8, /* mask->integer and integer->mask moves */
1617 {6, 6, 6}, /* cost of loading mask register
1618 in QImode, HImode, SImode. */
1619 {8, 8, 8}, /* cost if storing mask register
1620 in QImode, HImode, SImode. */
1621 2, /* cost of moving mask register. */
1622 /* End of register allocator costs. */
1625 COSTS_N_INSNS (1), /* cost of an add instruction. */
1626 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1627 COSTS_N_INSNS (1), /* variable shift costs. */
1628 COSTS_N_INSNS (1), /* constant shift costs. */
1629 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1630 COSTS_N_INSNS (3), /* HI. */
1631 COSTS_N_INSNS (3), /* SI. */
1632 COSTS_N_INSNS (3), /* DI. */
1633 COSTS_N_INSNS (3)}, /* other. */
1634 0, /* cost of multiply per each bit
1636 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1638 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1639 COSTS_N_INSNS (22), /* HI. */
1640 COSTS_N_INSNS (30), /* SI. */
1641 COSTS_N_INSNS (45), /* DI. */
1642 COSTS_N_INSNS (45)}, /* other. */
1643 COSTS_N_INSNS (1), /* cost of movsx. */
1644 COSTS_N_INSNS (1), /* cost of movzx. */
1645 8, /* "large" insn. */
1646 9, /* MOVE_RATIO. */
1647 6, /* CLEAR_RATIO */
1648 {6, 6, 6}, /* cost of loading integer registers
1649 in QImode, HImode and SImode.
1650 Relative to reg-reg move (2). */
1651 {8, 8, 8}, /* cost of storing integer
1653 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1654 in 32bit, 64bit, 128bit, 256bit and 512bit */
1655 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1656 in 32bit, 64bit, 128bit, 256bit and 512bit */
1657 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1658 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1659 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1661 6, /* cost of moving SSE register to integer. */
1662 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
1663 throughput 12. Approx 9 uops do not depend on vector size and every load
1665 18, 8, /* Gather load static, per_elt. */
1666 18, 10, /* Gather store static, per_elt. */
1667 32, /* size of l1 cache. */
1668 512, /* size of l2 cache. */
1669 64, /* size of prefetch block. */
1670 /* New AMD processors never drop prefetches; if they cannot be performed
1671 immediately, they are queued. We set number of simultaneous prefetches
1672 to a large constant to reflect this (it probably is not a good idea not
1673 to limit number of prefetches at all, as their execution also takes some
1675 100, /* number of parallel prefetches. */
1676 3, /* Branch cost. */
1677 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1678 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1679 /* Latency of fdiv is 8-15. */
1680 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1681 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1682 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1683 /* Latency of fsqrt is 4-10. */
1684 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1686 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1687 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1688 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1689 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1690 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1691 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1692 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1694 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1695 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1696 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1697 /* Zen can execute 4 integer operations per cycle. FP operations
1698 take 3 cycles and it can execute 2 integer additions and 2
1699 multiplications thus reassociation may make sense up to width of 6.
1700 SPEC2k6 benchmarks suggest
1701 that 4 works better than 6 probably due to register pressure.
1703 Integer vector operations are taken by FP unit and execute 3 vector
1704 plus/minus operations per cycle but only one multiply. This is adjusted
1705 in ix86_reassociation_width. */
1706 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1709 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1710 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1711 "16", /* Loop alignment. */
1712 "16", /* Jump alignment. */
1713 "0:0:8", /* Label alignment. */
1714 "16", /* Func alignment. */
1715 4, /* Small unroll limit. */
1716 2, /* Small unroll factor. */
1719 struct processor_costs znver3_cost
= {
1721 /* Start of register allocator costs. integer->integer move cost is 2. */
1723 /* reg-reg moves are done by renaming and thus they are even cheaper than
1724 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1725 to doubles of latencies, we do not model this correctly. It does not
1726 seem to make practical difference to bump prices up even more. */
1727 6, /* cost for loading QImode using
1729 {6, 6, 6}, /* cost of loading integer registers
1730 in QImode, HImode and SImode.
1731 Relative to reg-reg move (2). */
1732 {8, 8, 8}, /* cost of storing integer
1734 2, /* cost of reg,reg fld/fst. */
1735 {6, 6, 16}, /* cost of loading fp registers
1736 in SFmode, DFmode and XFmode. */
1737 {8, 8, 16}, /* cost of storing fp registers
1738 in SFmode, DFmode and XFmode. */
1739 2, /* cost of moving MMX register. */
1740 {6, 6}, /* cost of loading MMX registers
1741 in SImode and DImode. */
1742 {8, 8}, /* cost of storing MMX registers
1743 in SImode and DImode. */
1744 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1746 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1747 in 32,64,128,256 and 512-bit. */
1748 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1749 in 32,64,128,256 and 512-bit. */
1750 6, 6, /* SSE->integer and integer->SSE
1752 8, 8, /* mask->integer and integer->mask moves */
1753 {6, 6, 6}, /* cost of loading mask register
1754 in QImode, HImode, SImode. */
1755 {8, 8, 8}, /* cost if storing mask register
1756 in QImode, HImode, SImode. */
1757 2, /* cost of moving mask register. */
1758 /* End of register allocator costs. */
1761 COSTS_N_INSNS (1), /* cost of an add instruction. */
1762 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1763 COSTS_N_INSNS (1), /* variable shift costs. */
1764 COSTS_N_INSNS (1), /* constant shift costs. */
1765 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1766 COSTS_N_INSNS (3), /* HI. */
1767 COSTS_N_INSNS (3), /* SI. */
1768 COSTS_N_INSNS (3), /* DI. */
1769 COSTS_N_INSNS (3)}, /* other. */
1770 0, /* cost of multiply per each bit
1772 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
1773 COSTS_N_INSNS (10), /* HI. */
1774 COSTS_N_INSNS (12), /* SI. */
1775 COSTS_N_INSNS (17), /* DI. */
1776 COSTS_N_INSNS (17)}, /* other. */
1777 COSTS_N_INSNS (1), /* cost of movsx. */
1778 COSTS_N_INSNS (1), /* cost of movzx. */
1779 8, /* "large" insn. */
1780 9, /* MOVE_RATIO. */
1781 6, /* CLEAR_RATIO */
1782 {6, 6, 6}, /* cost of loading integer registers
1783 in QImode, HImode and SImode.
1784 Relative to reg-reg move (2). */
1785 {8, 8, 8}, /* cost of storing integer
1787 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1788 in 32bit, 64bit, 128bit, 256bit and 512bit */
1789 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1790 in 32bit, 64bit, 128bit, 256bit and 512bit */
1791 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1792 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1793 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1795 6, /* cost of moving SSE register to integer. */
1796 /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1797 throughput 9. Approx 7 uops do not depend on vector size and every load
1799 14, 8, /* Gather load static, per_elt. */
1800 14, 10, /* Gather store static, per_elt. */
1801 32, /* size of l1 cache. */
1802 512, /* size of l2 cache. */
1803 64, /* size of prefetch block. */
1804 /* New AMD processors never drop prefetches; if they cannot be performed
1805 immediately, they are queued. We set number of simultaneous prefetches
1806 to a large constant to reflect this (it probably is not a good idea not
1807 to limit number of prefetches at all, as their execution also takes some
1809 100, /* number of parallel prefetches. */
1810 3, /* Branch cost. */
1811 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1812 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1813 /* Latency of fdiv is 8-15. */
1814 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1815 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1816 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1817 /* Latency of fsqrt is 4-10. */
1818 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1820 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1821 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1822 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1823 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1824 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1825 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1826 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1828 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1829 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1830 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1831 /* Zen can execute 4 integer operations per cycle. FP operations
1832 take 3 cycles and it can execute 2 integer additions and 2
1833 multiplications thus reassociation may make sense up to width of 6.
1834 SPEC2k6 benchmarks suggest
1835 that 4 works better than 6 probably due to register pressure.
1837 Integer vector operations are taken by FP unit and execute 3 vector
1838 plus/minus operations per cycle but only one multiply. This is adjusted
1839 in ix86_reassociation_width. */
1840 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1843 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1844 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1845 "16", /* Loop alignment. */
1846 "16", /* Jump alignment. */
1847 "0:0:8", /* Label alignment. */
1848 "16", /* Func alignment. */
1849 4, /* Small unroll limit. */
1850 2, /* Small unroll factor. */
1853 /* This table was originally a replica of the znver3_cost table; several entries (e.g. x87 load/store, integer divide, L2 cache size) now differ. */
1854 struct processor_costs znver4_cost
= {
1856 /* Start of register allocator costs. integer->integer move cost is 2. */
1858 /* reg-reg moves are done by renaming and thus they are even cheaper than
1859 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1860 to doubles of latencies, we do not model this correctly. It does not
1861 seem to make practical difference to bump prices up even more. */
1862 6, /* cost for loading QImode using
1864 {6, 6, 6}, /* cost of loading integer registers
1865 in QImode, HImode and SImode.
1866 Relative to reg-reg move (2). */
1867 {8, 8, 8}, /* cost of storing integer
1869 2, /* cost of reg,reg fld/fst. */
1870 {14, 14, 17}, /* cost of loading fp registers
1871 in SFmode, DFmode and XFmode. */
1872 {12, 12, 16}, /* cost of storing fp registers
1873 in SFmode, DFmode and XFmode. */
1874 2, /* cost of moving MMX register. */
1875 {6, 6}, /* cost of loading MMX registers
1876 in SImode and DImode. */
1877 {8, 8}, /* cost of storing MMX registers
1878 in SImode and DImode. */
1879 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1881 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1882 in 32,64,128,256 and 512-bit. */
1883 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
1884 in 32,64,128,256 and 512-bit. */
1885 6, 8, /* SSE->integer and integer->SSE
1887 8, 8, /* mask->integer and integer->mask moves */
1888 {6, 6, 6}, /* cost of loading mask register
1889 in QImode, HImode, SImode. */
1890 {8, 8, 8}, /* cost if storing mask register
1891 in QImode, HImode, SImode. */
1892 2, /* cost of moving mask register. */
1893 /* End of register allocator costs. */
1896 COSTS_N_INSNS (1), /* cost of an add instruction. */
1897 /* TODO: Lea with 3 components has cost 2. */
1898 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1899 COSTS_N_INSNS (1), /* variable shift costs. */
1900 COSTS_N_INSNS (1), /* constant shift costs. */
1901 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1902 COSTS_N_INSNS (3), /* HI. */
1903 COSTS_N_INSNS (3), /* SI. */
1904 COSTS_N_INSNS (3), /* DI. */
1905 COSTS_N_INSNS (3)}, /* other. */
1906 0, /* cost of multiply per each bit
1908 {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */
1909 COSTS_N_INSNS (13), /* HI. */
1910 COSTS_N_INSNS (13), /* SI. */
1911 COSTS_N_INSNS (18), /* DI. */
1912 COSTS_N_INSNS (18)}, /* other. */
1913 COSTS_N_INSNS (1), /* cost of movsx. */
1914 COSTS_N_INSNS (1), /* cost of movzx. */
1915 8, /* "large" insn. */
1916 9, /* MOVE_RATIO. */
1917 6, /* CLEAR_RATIO */
1918 {6, 6, 6}, /* cost of loading integer registers
1919 in QImode, HImode and SImode.
1920 Relative to reg-reg move (2). */
1921 {8, 8, 8}, /* cost of storing integer
1923 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1924 in 32bit, 64bit, 128bit, 256bit and 512bit */
1925 {8, 8, 8, 12, 12}, /* cost of storing SSE register
1926 in 32bit, 64bit, 128bit, 256bit and 512bit */
1927 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
1928 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
1929 2, 2, 2, /* cost of moving XMM,YMM,ZMM
1931 6, /* cost of moving SSE register to integer. */
1932 /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1933 throughput 5. Approx 7 uops do not depend on vector size and every load
1935 14, 10, /* Gather load static, per_elt. */
1936 14, 20, /* Gather store static, per_elt. */
1937 32, /* size of l1 cache. */
1938 1024, /* size of l2 cache. */
1939 64, /* size of prefetch block. */
1940 /* New AMD processors never drop prefetches; if they cannot be performed
1941 immediately, they are queued. We set number of simultaneous prefetches
1942 to a large constant to reflect this (it probably is not a good idea not
1943 to limit number of prefetches at all, as their execution also takes some
1945 100, /* number of parallel prefetches. */
1946 3, /* Branch cost. */
1947 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
1948 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1949 /* Latency of fdiv is 8-15. */
1950 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1951 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1952 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1953 /* Latency of fsqrt is 4-10. */
1954 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
1956 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1957 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1958 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1959 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1960 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1961 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1962 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1964 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1965 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1966 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1967 /* Zen can execute 4 integer operations per cycle. FP operations
1968 take 3 cycles and it can execute 2 integer additions and 2
1969 multiplications thus reassociation may make sense up to width of 6.
1970 SPEC2k6 benchmarks suggest
1971 that 4 works better than 6 probably due to register pressure.
1973 Integer vector operations are taken by FP unit and execute 3 vector
1974 plus/minus operations per cycle but only one multiply. This is adjusted
1975 in ix86_reassociation_width. */
1976 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1979 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1980 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1981 "16", /* Loop alignment. */
1982 "16", /* Jump alignment. */
1983 "0:0:8", /* Label alignment. */
1984 "16", /* Func alignment. */
1985 4, /* Small unroll limit. */
1986 2, /* Small unroll factor. */
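1989 /* This table was originally a replica of the znver4_cost table; several entries (e.g. integer divide, "large" insn, L1 cache size, ADDSS cost, reassociation widths) now differ. */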
1990 struct processor_costs znver5_cost
= {
1992 /* Start of register allocator costs. integer->integer move cost is 2. */
1994 /* reg-reg moves are done by renaming and thus they are even cheaper than
1995 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1996 to doubles of latencies, we do not model this correctly. It does not
1997 seem to make practical difference to bump prices up even more. */
1998 6, /* cost for loading QImode using
2000 {6, 6, 6}, /* cost of loading integer registers
2001 in QImode, HImode and SImode.
2002 Relative to reg-reg move (2). */
2003 {8, 8, 8}, /* cost of storing integer
2005 2, /* cost of reg,reg fld/fst. */
2006 {14, 14, 17}, /* cost of loading fp registers
2007 in SFmode, DFmode and XFmode. */
2008 {12, 12, 16}, /* cost of storing fp registers
2009 in SFmode, DFmode and XFmode. */
2010 2, /* cost of moving MMX register. */
2011 {6, 6}, /* cost of loading MMX registers
2012 in SImode and DImode. */
2013 {8, 8}, /* cost of storing MMX registers
2014 in SImode and DImode. */
2015 2, 2, 3, /* cost of moving XMM,YMM,ZMM
2017 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2018 in 32,64,128,256 and 512-bit. */
2019 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
2020 in 32,64,128,256 and 512-bit. */
2021 6, 8, /* SSE->integer and integer->SSE
2023 8, 8, /* mask->integer and integer->mask moves */
2024 {6, 6, 6}, /* cost of loading mask register
2025 in QImode, HImode, SImode. */
2026 {8, 8, 8}, /* cost if storing mask register
2027 in QImode, HImode, SImode. */
2028 2, /* cost of moving mask register. */
2029 /* End of register allocator costs. */
2032 COSTS_N_INSNS (1), /* cost of an add instruction. */
2033 /* TODO: Lea with 3 components has cost 2. */
2034 COSTS_N_INSNS (1), /* cost of a lea instruction. */
2035 COSTS_N_INSNS (1), /* variable shift costs. */
2036 COSTS_N_INSNS (1), /* constant shift costs. */
2037 /* mul has latency 3, executes in 3 integer units. */
2038 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
2039 COSTS_N_INSNS (3), /* HI. */
2040 COSTS_N_INSNS (3), /* SI. */
2041 COSTS_N_INSNS (3), /* DI. */
2042 COSTS_N_INSNS (3)}, /* other. */
2043 0, /* cost of multiply per each bit
2045 /* integer divide has latency of 8 cycles
2046 plus 1 for every 9 bits of quotient. */
2047 {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */
2048 COSTS_N_INSNS (11), /* HI. */
2049 COSTS_N_INSNS (13), /* SI. */
2050 COSTS_N_INSNS (16), /* DI. */
2051 COSTS_N_INSNS (16)}, /* other. */
2052 COSTS_N_INSNS (1), /* cost of movsx. */
2053 COSTS_N_INSNS (1), /* cost of movzx. */
2054 15, /* "large" insn. */
2055 9, /* MOVE_RATIO. */
2056 6, /* CLEAR_RATIO */
2057 {6, 6, 6}, /* cost of loading integer registers
2058 in QImode, HImode and SImode.
2059 Relative to reg-reg move (2). */
2060 {8, 8, 8}, /* cost of storing integer
2062 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2063 in 32bit, 64bit, 128bit, 256bit and 512bit */
2064 {8, 8, 8, 12, 12}, /* cost of storing SSE register
2065 in 32bit, 64bit, 128bit, 256bit and 512bit */
2066 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
2067 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
2068 2, 2, 2, /* cost of moving XMM,YMM,ZMM
2070 6, /* cost of moving SSE register to integer. */
2072 /* TODO: gather and scatter instructions are currently disabled in
2073 x86-tune.def. In some cases they are a win, however; see PR116582.
2074 We need a good cost model for them, though. */
2075 14, 10, /* Gather load static, per_elt. */
2076 14, 20, /* Gather store static, per_elt. */
2077 48, /* size of l1 cache. */
2078 1024, /* size of l2 cache. */
2079 64, /* size of prefetch block. */
2080 /* New AMD processors never drop prefetches; if they cannot be performed
2081 immediately, they are queued. We set number of simultaneous prefetches
2082 to a large constant to reflect this (it probably is not a good idea not
2083 to limit number of prefetches at all, as their execution also takes some
2085 100, /* number of parallel prefetches. */
2086 3, /* Branch cost. */
2087 /* TODO x87 latencies are still based on znver4.
2088 Probably not very important these days. */
2089 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
2090 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2091 /* Latency of fdiv is 8-15. */
2092 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
2093 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2094 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2095 /* Latency of fsqrt is 4-10. */
2096 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
2098 /* SSE instructions have typical throughput 4 and latency 1. */
2099 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2100 /* ADDSS has throughput 2 and latency 2
2101 (in some cases when source is another addition). */
2102 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
2103 /* MULSS has throughput 2 and latency 3. */
2104 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
2105 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
2106 /* FMA has throughput 2 and latency 4. */
2107 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2108 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2109 /* DIVSS has throughput 0.4 and latency 10. */
2110 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
2111 /* DIVSD has throughput 0.25 and latency 13. */
2112 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
2113 /* SQRTSS has throughput 0.22 and latency 14. */
2114 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2115 /* SQRTSD has throughput 0.13 and latency 20. */
2116 COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */
2117 /* Zen5 can execute:
2118 - integer ops: 6 per cycle, at most 3 multiplications.
2119 latency 1 for additions, 3 for multiplications (pipelined)
2121 Setting width of 9 for multiplication is probably excessive
2122 for register pressure.
2123 - fp ops: 2 additions per cycle, latency 2-3
2124 2 multiplications per cycle, latency 3
2125 - vector integer ops: 4 additions, latency 1
2126 2 multiplications, latency 4
2127 We increase width to 6 for multiplications
2128 in ix86_reassociation_width. */
2129 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */
2132 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2133 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2134 "16", /* Loop alignment. */
2135 "16", /* Jump alignment. */
2136 "0:0:8", /* Label alignment. */
2137 "16", /* Func alignment. */
2138 4, /* Small unroll limit. */
2139 2, /* Small unroll factor. */
2142 /* skylake_cost should produce code tuned for Skylake family of CPUs. */
2143 static stringop_algs skylake_memcpy
[2] = {
2145 {{256, rep_prefix_1_byte
, true},
2147 {-1, libcall
, false}}},
2149 {{256, rep_prefix_1_byte
, true},
2151 {-1, libcall
, false}}}};
2153 static stringop_algs skylake_memset
[2] = {
2155 {{256, rep_prefix_1_byte
, true},
2157 {-1, libcall
, false}}},
2159 {{256, rep_prefix_1_byte
, true},
2161 {-1, libcall
, false}}}};
2164 struct processor_costs skylake_cost
= {
2166 /* Start of register allocator costs. integer->integer move cost is 2. */
2167 6, /* cost for loading QImode using movzbl */
2168 {4, 4, 4}, /* cost of loading integer registers
2169 in QImode, HImode and SImode.
2170 Relative to reg-reg move (2). */
2171 {6, 6, 6}, /* cost of storing integer registers */
2172 2, /* cost of reg,reg fld/fst */
2173 {6, 6, 8}, /* cost of loading fp registers
2174 in SFmode, DFmode and XFmode */
2175 {6, 6, 10}, /* cost of storing fp registers
2176 in SFmode, DFmode and XFmode */
2177 2, /* cost of moving MMX register */
2178 {6, 6}, /* cost of loading MMX registers
2179 in SImode and DImode */
2180 {6, 6}, /* cost of storing MMX registers
2181 in SImode and DImode */
2182 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2183 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2184 in 32,64,128,256 and 512-bit */
2185 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2186 in 32,64,128,256 and 512-bit */
2187 6, 6, /* SSE->integer and integer->SSE moves */
2188 6, 6, /* mask->integer and integer->mask moves */
2189 {8, 8, 8}, /* cost of loading mask register
2190 in QImode, HImode, SImode. */
2191 {6, 6, 6}, /* cost if storing mask register
2192 in QImode, HImode, SImode. */
2193 3, /* cost of moving mask register. */
2194 /* End of register allocator costs. */
2197 COSTS_N_INSNS (1), /* cost of an add instruction */
2198 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2199 COSTS_N_INSNS (1), /* variable shift costs */
2200 COSTS_N_INSNS (1), /* constant shift costs */
2201 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2202 COSTS_N_INSNS (3), /* HI */
2203 COSTS_N_INSNS (3), /* SI */
2204 COSTS_N_INSNS (3), /* DI */
2205 COSTS_N_INSNS (3)}, /* other */
2206 0, /* cost of multiply per each bit set */
2207 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2208 model is not realistic. We compensate by increasing the latencies a bit. */
2209 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2210 COSTS_N_INSNS (11), /* HI */
2211 COSTS_N_INSNS (14), /* SI */
2212 COSTS_N_INSNS (76), /* DI */
2213 COSTS_N_INSNS (76)}, /* other */
2214 COSTS_N_INSNS (1), /* cost of movsx */
2215 COSTS_N_INSNS (0), /* cost of movzx */
2216 8, /* "large" insn */
2217 17, /* MOVE_RATIO */
2218 17, /* CLEAR_RATIO */
2219 {6, 6, 6}, /* cost of loading integer registers
2220 in QImode, HImode and SImode.
2221 Relative to reg-reg move (2). */
2222 {8, 8, 8}, /* cost of storing integer registers */
2223 {8, 8, 8, 8, 16}, /* cost of loading SSE register
2224 in 32bit, 64bit, 128bit, 256bit and 512bit */
2225 {8, 8, 8, 8, 16}, /* cost of storing SSE register
2226 in 32bit, 64bit, 128bit, 256bit and 512bit */
2227 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2228 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2229 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2230 6, /* cost of moving SSE register to integer. */
2231 20, 8, /* Gather load static, per_elt. */
2232 22, 10, /* Gather store static, per_elt. */
2233 64, /* size of l1 cache. */
2234 512, /* size of l2 cache. */
2235 64, /* size of prefetch block */
2236 6, /* number of parallel prefetches */
2237 3, /* Branch cost */
2238 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2240 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2241 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2242 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2243 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2245 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2246 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2247 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2248 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2249 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2250 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2251 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2252 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2253 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2254 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2255 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2258 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2259 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2260 "16:11:8", /* Loop alignment. */
2261 "16:11:8", /* Jump alignment. */
2262 "0:0:8", /* Label alignment. */
2263 "16", /* Func alignment. */
2264 4, /* Small unroll limit. */
2265 2, /* Small unroll factor. */
2268 /* icelake_cost should produce code tuned for Icelake family of CPUs.
2269 NB: rep_prefix_1_byte is used only for known size. */
2271 static stringop_algs icelake_memcpy
[2] = {
2273 {{256, rep_prefix_1_byte
, true},
2275 {-1, libcall
, false}}},
2277 {{256, rep_prefix_1_byte
, true},
2279 {-1, libcall
, false}}}};
2281 static stringop_algs icelake_memset
[2] = {
2283 {{256, rep_prefix_1_byte
, true},
2285 {-1, libcall
, false}}},
2287 {{256, rep_prefix_1_byte
, true},
2289 {-1, libcall
, false}}}};
/* Operation cost table for Icelake tuning.  The initializer is positional:
   values must stay in the field order of struct processor_costs, and the
   trailing comment on each line names the field being set.
   NOTE(review): in this listing the initializer has no closing "};" and
   never names icelake_memcpy / icelake_memset; the leading 4-digit numbers
   look like listing line numbers rather than data.  Lines appear to have
   been lost in extraction -- verify against upstream before relying on it.  */
2292 struct processor_costs icelake_cost
= {
2294 /* Start of register allocator costs. integer->integer move cost is 2. */
2295 6, /* cost for loading QImode using movzbl */
2296 {4, 4, 4}, /* cost of loading integer registers
2297 in QImode, HImode and SImode.
2298 Relative to reg-reg move (2). */
2299 {6, 6, 6}, /* cost of storing integer registers */
2300 2, /* cost of reg,reg fld/fst */
2301 {6, 6, 8}, /* cost of loading fp registers
2302 in SFmode, DFmode and XFmode */
2303 {6, 6, 10}, /* cost of storing fp registers
2304 in SFmode, DFmode and XFmode */
2305 2, /* cost of moving MMX register */
2306 {6, 6}, /* cost of loading MMX registers
2307 in SImode and DImode */
2308 {6, 6}, /* cost of storing MMX registers
2309 in SImode and DImode */
2310 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2311 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2312 in 32,64,128,256 and 512-bit */
2313 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2314 in 32,64,128,256 and 512-bit */
2315 6, 6, /* SSE->integer and integer->SSE moves */
2316 6, 6, /* mask->integer and integer->mask moves */
2317 {8, 8, 8}, /* cost of loading mask register
2318 in QImode, HImode, SImode. */
2319 {6, 6, 6}, /* cost of storing mask register
2320 in QImode, HImode, SImode. */
2321 3, /* cost of moving mask register. */
2322 /* End of register allocator costs. */
2325 COSTS_N_INSNS (1), /* cost of an add instruction */
2326 COSTS_N_INSNS (1)+1, /* cost of a lea instruction (one unit above an add) */
2327 COSTS_N_INSNS (1), /* variable shift costs */
2328 COSTS_N_INSNS (1), /* constant shift costs */
2329 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2330 COSTS_N_INSNS (3), /* HI */
2331 COSTS_N_INSNS (3), /* SI */
2332 COSTS_N_INSNS (3), /* DI */
2333 COSTS_N_INSNS (3)}, /* other */
2334 0, /* cost of multiply per each bit set */
2335 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2336 model is not realistic. We compensate by increasing the latencies a bit. */
2337 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2338 COSTS_N_INSNS (11), /* HI */
2339 COSTS_N_INSNS (14), /* SI */
2340 COSTS_N_INSNS (76), /* DI */
2341 COSTS_N_INSNS (76)}, /* other */
2342 COSTS_N_INSNS (1), /* cost of movsx */
2343 COSTS_N_INSNS (0), /* cost of movzx */
2344 8, /* "large" insn */
2345 17, /* MOVE_RATIO */
2346 17, /* CLEAR_RATIO */
2347 {6, 6, 6}, /* cost of loading integer registers
2348 in QImode, HImode and SImode.
2349 Relative to reg-reg move (2). */
2350 {8, 8, 8}, /* cost of storing integer registers */
2351 {8, 8, 8, 8, 16}, /* cost of loading SSE register
2352 in 32bit, 64bit, 128bit, 256bit and 512bit */
2353 {8, 8, 8, 8, 16}, /* cost of storing SSE register
2354 in 32bit, 64bit, 128bit, 256bit and 512bit */
2355 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2356 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2357 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2358 6, /* cost of moving SSE register to integer. */
2359 20, 8, /* Gather load static, per_elt. */
2360 22, 10, /* Gather store static, per_elt. */
2361 64, /* size of l1 cache. */
2362 512, /* size of l2 cache. */
2363 64, /* size of prefetch block */
2364 6, /* number of parallel prefetches */
2365 3, /* Branch cost */
2366 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2367 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2368 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2369 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2370 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2371 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2373 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2374 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2375 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2376 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2377 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2378 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2379 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2380 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2381 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2382 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2383 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2386 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2387 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2388 "16:11:8", /* Loop alignment. */
2389 "16:11:8", /* Jump alignment. */
2390 "0:0:8", /* Label alignment. */
2391 "16", /* Func alignment. */
2392 4, /* Small unroll limit. */
2393 2, /* Small unroll factor. */
2396 /* alderlake_cost should produce code tuned for alderlake family of CPUs. */
/* Alderlake tuning: strategy table for memcpy (two elements).
   Every entry is a {bound, algorithm, flag} triple, and the last entry of
   each list uses bound -1 (presumably "any size" -- confirm against the
   stringop_algs declaration).
   NOTE(review): compared with ix86_size_memcpy at the top of the file, each
   element lacks the leading algorithm field in front of its list, so the
   braces below do not balance; the leading 4-digit numbers also look like
   listing line numbers.  Both point to extraction damage -- restore from
   upstream before building.  */
2397 static stringop_algs alderlake_memcpy
[2] = {
2399 {{256, rep_prefix_1_byte
, true},
2401 {-1, libcall
, false}}},
2403 {{256, rep_prefix_1_byte
, true},
2405 {-1, libcall
, false}}}};
/* Alderlake tuning: strategy table for memset (two elements).
   Every entry is a {bound, algorithm, flag} triple, and the last entry of
   each list uses bound -1 (presumably "any size" -- confirm against the
   stringop_algs declaration).
   NOTE(review): compared with ix86_size_memset at the top of the file, each
   element lacks the leading algorithm field in front of its list, so the
   braces below do not balance; the leading 4-digit numbers also look like
   listing line numbers.  Both point to extraction damage -- restore from
   upstream before building.  */
2406 static stringop_algs alderlake_memset
[2] = {
2408 {{256, rep_prefix_1_byte
, true},
2410 {-1, libcall
, false}}},
2412 {{256, rep_prefix_1_byte
, true},
2414 {-1, libcall
, false}}}};
/* Operation cost table for Alderlake tuning.  The initializer is positional:
   values must stay in the field order of struct processor_costs, and the
   trailing comment on each line names the field being set.
   NOTE(review): in this listing the initializer has no closing "};" and
   never names alderlake_memcpy / alderlake_memset; the leading 4-digit
   numbers look like listing line numbers rather than data.  Lines appear to
   have been lost in extraction -- verify against upstream before relying on
   it.  */
2416 struct processor_costs alderlake_cost
= {
2418 /* Start of register allocator costs. integer->integer move cost is 2. */
2419 6, /* cost for loading QImode using movzbl */
2420 {6, 6, 6}, /* cost of loading integer registers
2421 in QImode, HImode and SImode.
2422 Relative to reg-reg move (2). */
2423 {6, 6, 6}, /* cost of storing integer registers */
2424 4, /* cost of reg,reg fld/fst */
2425 {6, 6, 12}, /* cost of loading fp registers
2426 in SFmode, DFmode and XFmode */
2427 {6, 6, 12}, /* cost of storing fp registers
2428 in SFmode, DFmode and XFmode */
2429 2, /* cost of moving MMX register */
2430 {6, 6}, /* cost of loading MMX registers
2431 in SImode and DImode */
2432 {6, 6}, /* cost of storing MMX registers
2433 in SImode and DImode */
2434 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2435 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2436 in 32,64,128,256 and 512-bit */
2437 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2438 in 32,64,128,256 and 512-bit */
2439 6, 6, /* SSE->integer and integer->SSE moves */
2440 6, 6, /* mask->integer and integer->mask moves */
2441 {6, 6, 6}, /* cost of loading mask register
2442 in QImode, HImode, SImode. */
2443 {6, 6, 6}, /* cost of storing mask register
2444 in QImode, HImode, SImode. */
2445 2, /* cost of moving mask register. */
2446 /* End of register allocator costs. */
2449 COSTS_N_INSNS (1), /* cost of an add instruction */
2450 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above an add) */
2451 COSTS_N_INSNS (1), /* variable shift costs */
2452 COSTS_N_INSNS (1), /* constant shift costs */
2453 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2454 COSTS_N_INSNS (3), /* HI */
2455 COSTS_N_INSNS (3), /* SI */
2456 COSTS_N_INSNS (3), /* DI */
2457 COSTS_N_INSNS (4)}, /* other */
2458 0, /* cost of multiply per each bit set */
2459 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2460 COSTS_N_INSNS (22), /* HI */
2461 COSTS_N_INSNS (30), /* SI */
2462 COSTS_N_INSNS (74), /* DI */
2463 COSTS_N_INSNS (74)}, /* other */
2464 COSTS_N_INSNS (1), /* cost of movsx */
2465 COSTS_N_INSNS (1), /* cost of movzx */
2466 8, /* "large" insn */
2467 17, /* MOVE_RATIO */
2468 17, /* CLEAR_RATIO */
2469 {6, 6, 6}, /* cost of loading integer registers
2470 in QImode, HImode and SImode.
2471 Relative to reg-reg move (2). */
2472 {8, 8, 8}, /* cost of storing integer registers */
2473 {8, 8, 8, 10, 15}, /* cost of loading SSE register
2474 in 32bit, 64bit, 128bit, 256bit and 512bit */
2475 {8, 8, 8, 10, 15}, /* cost of storing SSE register
2476 in 32bit, 64bit, 128bit, 256bit and 512bit */
2477 {8, 8, 8, 10, 15}, /* cost of unaligned loads. */
2478 {8, 8, 8, 10, 15}, /* cost of unaligned stores. */
2479 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2480 6, /* cost of moving SSE register to integer. */
2481 18, 6, /* Gather load static, per_elt. */
2482 18, 6, /* Gather store static, per_elt. */
2483 32, /* size of l1 cache. */
2484 512, /* size of l2 cache. */
2485 64, /* size of prefetch block */
2486 6, /* number of parallel prefetches */
2487 3, /* Branch cost */
2488 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2489 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2490 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2491 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2492 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2493 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2495 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2496 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2497 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2498 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2499 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2500 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2501 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2502 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2503 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2504 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2505 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2508 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2509 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2510 "16:11:8", /* Loop alignment. */
2511 "16:11:8", /* Jump alignment. */
2512 "0:0:8", /* Label alignment. */
2513 "16", /* Func alignment. */
2514 4, /* Small unroll limit. */
2515 2, /* Small unroll factor. */
2518 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
2519 very small blocks it is better to use a loop. For large blocks, a libcall can
2520 use non-temporal accesses and beat inline code considerably. */
/* BTVER1 tuning: strategy table for memcpy (two elements).
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop (6), unrolled_loop (14), then
   rep_prefix_4_byte; element 1 goes loop (16), rep_prefix_8_byte (8192),
   then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (4-byte vs 8-byte rep prefix) -- confirm against the stringop_algs
   declaration.  */
2521 static stringop_algs btver1_memcpy
[2] = {
2522 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
2523 {-1, rep_prefix_4_byte
, false}}},
2524 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
2525 {-1, libcall
, false}}}};
/* BTVER1 tuning: strategy table for memset (two elements).
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048), then libcall; element 1 goes unrolled_loop (48),
   rep_prefix_8_byte (8192), then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (4-byte vs 8-byte rep prefix) -- confirm against the stringop_algs
   declaration.  */
2526 static stringop_algs btver1_memset
[2] = {
2527 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
2528 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2529 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
2530 {-1, libcall
, false}}}};
/* Operation cost table for BTVER1 tuning.  The initializer is positional:
   values must stay in the field order of struct processor_costs, and the
   trailing comment on each line names the field being set.
   NOTE(review): in this listing the initializer has no closing "};", never
   names btver1_memcpy / btver1_memset, and has no MOVE_RATIO entry between
   the "large" insn and CLEAR_RATIO lines (nocona_cost and atom_cost in this
   file do have one).  The leading 4-digit numbers look like listing line
   numbers rather than data.  Lines appear to have been lost in extraction --
   verify against upstream before relying on it.  */
2531 const struct processor_costs btver1_cost
= {
2533 /* Start of register allocator costs. integer->integer move cost is 2. */
2534 8, /* cost for loading QImode using movzbl */
2535 {6, 8, 6}, /* cost of loading integer registers
2536 in QImode, HImode and SImode.
2537 Relative to reg-reg move (2). */
2538 {6, 8, 6}, /* cost of storing integer registers */
2539 4, /* cost of reg,reg fld/fst */
2540 {12, 12, 28}, /* cost of loading fp registers
2541 in SFmode, DFmode and XFmode */
2542 {12, 12, 38}, /* cost of storing fp registers
2543 in SFmode, DFmode and XFmode */
2544 4, /* cost of moving MMX register */
2545 {10, 10}, /* cost of loading MMX registers
2546 in SImode and DImode */
2547 {12, 12}, /* cost of storing MMX registers
2548 in SImode and DImode */
2549 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2550 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2551 in 32,64,128,256 and 512-bit */
2552 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2553 in 32,64,128,256 and 512-bit */
2554 14, 14, /* SSE->integer and integer->SSE moves */
2555 14, 14, /* mask->integer and integer->mask moves */
2556 {6, 8, 6}, /* cost of loading mask register
2557 in QImode, HImode, SImode. */
2558 {6, 8, 6}, /* cost of storing mask register
2559 in QImode, HImode, SImode. */
2560 2, /* cost of moving mask register. */
2561 /* End of register allocator costs. */
2564 COSTS_N_INSNS (1), /* cost of an add instruction */
2565 COSTS_N_INSNS (2), /* cost of a lea instruction */
2566 COSTS_N_INSNS (1), /* variable shift costs */
2567 COSTS_N_INSNS (1), /* constant shift costs */
2568 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2569 COSTS_N_INSNS (4), /* HI */
2570 COSTS_N_INSNS (3), /* SI */
2571 COSTS_N_INSNS (4), /* DI */
2572 COSTS_N_INSNS (5)}, /* other */
2573 0, /* cost of multiply per each bit set */
2574 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2575 COSTS_N_INSNS (35), /* HI */
2576 COSTS_N_INSNS (51), /* SI */
2577 COSTS_N_INSNS (83), /* DI */
2578 COSTS_N_INSNS (83)}, /* other */
2579 COSTS_N_INSNS (1), /* cost of movsx */
2580 COSTS_N_INSNS (1), /* cost of movzx */
2581 8, /* "large" insn */
2583 6, /* CLEAR_RATIO */
2584 {6, 8, 6}, /* cost of loading integer registers
2585 in QImode, HImode and SImode.
2586 Relative to reg-reg move (2). */
2587 {6, 8, 6}, /* cost of storing integer registers */
2588 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2589 in 32bit, 64bit, 128bit, 256bit and 512bit */
2590 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2591 in 32bit, 64bit, 128bit, 256bit and 512bit */
2592 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2593 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2594 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2595 14, /* cost of moving SSE register to integer. */
2596 10, 10, /* Gather load static, per_elt. */
2597 10, 10, /* Gather store static, per_elt. */
2598 32, /* size of l1 cache. */
2599 512, /* size of l2 cache. */
2600 64, /* size of prefetch block */
2601 100, /* number of parallel prefetches */
2602 2, /* Branch cost */
2603 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2604 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2605 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2606 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2607 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2608 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2610 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2611 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2612 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2613 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2614 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2615 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2616 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2617 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2618 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2619 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
2620 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2623 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2624 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2625 "16:11:8", /* Loop alignment. */
2626 "16:8:8", /* Jump alignment. */
2627 "0:0:8", /* Label alignment. */
2628 "11", /* Func alignment. */
2629 4, /* Small unroll limit. */
2630 2, /* Small unroll factor. */
/* BTVER2 tuning: strategy table for memcpy (two elements); the values match
   btver1_memcpy.
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop (6), unrolled_loop (14), then
   rep_prefix_4_byte; element 1 goes loop (16), rep_prefix_8_byte (8192),
   then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (4-byte vs 8-byte rep prefix) -- confirm against the stringop_algs
   declaration.  */
2633 static stringop_algs btver2_memcpy
[2] = {
2634 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
2635 {-1, rep_prefix_4_byte
, false}}},
2636 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
2637 {-1, libcall
, false}}}};
/* BTVER2 tuning: strategy table for memset (two elements); the values match
   btver1_memset.
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048), then libcall; element 1 goes unrolled_loop (48),
   rep_prefix_8_byte (8192), then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (4-byte vs 8-byte rep prefix) -- confirm against the stringop_algs
   declaration.  */
2638 static stringop_algs btver2_memset
[2] = {
2639 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
2640 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2641 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
2642 {-1, libcall
, false}}}};
/* Operation cost table for BTVER2 tuning.  The initializer is positional:
   values must stay in the field order of struct processor_costs, and the
   trailing comment on each line names the field being set.
   NOTE(review): in this listing the initializer has no closing "};", never
   names btver2_memcpy / btver2_memset, and has no MOVE_RATIO entry between
   the "large" insn and CLEAR_RATIO lines (nocona_cost and atom_cost in this
   file do have one).  The leading 4-digit numbers look like listing line
   numbers rather than data.  Lines appear to have been lost in extraction --
   verify against upstream before relying on it.  */
2643 const struct processor_costs btver2_cost
= {
2645 /* Start of register allocator costs. integer->integer move cost is 2. */
2646 8, /* cost for loading QImode using movzbl */
2647 {8, 8, 6}, /* cost of loading integer registers
2648 in QImode, HImode and SImode.
2649 Relative to reg-reg move (2). */
2650 {8, 8, 6}, /* cost of storing integer registers */
2651 4, /* cost of reg,reg fld/fst */
2652 {12, 12, 28}, /* cost of loading fp registers
2653 in SFmode, DFmode and XFmode */
2654 {12, 12, 38}, /* cost of storing fp registers
2655 in SFmode, DFmode and XFmode */
2656 4, /* cost of moving MMX register */
2657 {10, 10}, /* cost of loading MMX registers
2658 in SImode and DImode */
2659 {12, 12}, /* cost of storing MMX registers
2660 in SImode and DImode */
2661 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2662 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2663 in 32,64,128,256 and 512-bit */
2664 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2665 in 32,64,128,256 and 512-bit */
2666 14, 14, /* SSE->integer and integer->SSE moves */
2667 14, 14, /* mask->integer and integer->mask moves */
2668 {8, 8, 6}, /* cost of loading mask register
2669 in QImode, HImode, SImode. */
2670 {8, 8, 6}, /* cost of storing mask register
2671 in QImode, HImode, SImode. */
2672 2, /* cost of moving mask register. */
2673 /* End of register allocator costs. */
2676 COSTS_N_INSNS (1), /* cost of an add instruction */
2677 COSTS_N_INSNS (2), /* cost of a lea instruction */
2678 COSTS_N_INSNS (1), /* variable shift costs */
2679 COSTS_N_INSNS (1), /* constant shift costs */
2680 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2681 COSTS_N_INSNS (4), /* HI */
2682 COSTS_N_INSNS (3), /* SI */
2683 COSTS_N_INSNS (4), /* DI */
2684 COSTS_N_INSNS (5)}, /* other */
2685 0, /* cost of multiply per each bit set */
2686 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2687 COSTS_N_INSNS (35), /* HI */
2688 COSTS_N_INSNS (51), /* SI */
2689 COSTS_N_INSNS (83), /* DI */
2690 COSTS_N_INSNS (83)}, /* other */
2691 COSTS_N_INSNS (1), /* cost of movsx */
2692 COSTS_N_INSNS (1), /* cost of movzx */
2693 8, /* "large" insn */
2695 6, /* CLEAR_RATIO */
2696 {8, 8, 6}, /* cost of loading integer registers
2697 in QImode, HImode and SImode.
2698 Relative to reg-reg move (2). */
2699 {8, 8, 6}, /* cost of storing integer registers */
2700 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2701 in 32bit, 64bit, 128bit, 256bit and 512bit */
2702 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2703 in 32bit, 64bit, 128bit, 256bit and 512bit */
2704 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2705 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2706 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2707 14, /* cost of moving SSE register to integer. */
2708 10, 10, /* Gather load static, per_elt. */
2709 10, 10, /* Gather store static, per_elt. */
2710 32, /* size of l1 cache. */
2711 2048, /* size of l2 cache. */
2712 64, /* size of prefetch block */
2713 100, /* number of parallel prefetches */
2714 2, /* Branch cost */
2715 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2716 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2717 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2718 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2719 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2720 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2722 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2723 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2724 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2725 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2726 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2727 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2728 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2729 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
2730 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
2731 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
2732 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2735 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2736 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2737 "16:11:8", /* Loop alignment. */
2738 "16:8:8", /* Jump alignment. */
2739 "0:0:8", /* Label alignment. */
2740 "11", /* Func alignment. */
2741 4, /* Small unroll limit. */
2742 2, /* Small unroll factor. */
/* Pentium 4 tuning: strategy table for memcpy (two elements).
   Element 0 is {libcall, list} with {bound, algorithm, flag} triples:
   loop_1_byte up to bound 12, then rep_prefix_4_byte for the final -1 entry.
   Element 1 is DUMMY_STRINGOP_ALGS, which the top of the file defines as
   {libcall, {{-1, libcall, false}}}.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one --
   confirm against the stringop_algs declaration.  */
2745 static stringop_algs pentium4_memcpy
[2] = {
2746 {libcall
, {{12, loop_1_byte
, false}, {-1, rep_prefix_4_byte
, false}}},
2747 DUMMY_STRINGOP_ALGS
};
/* Pentium 4 tuning: strategy table for memset (two elements).
   Element 0 is {libcall, list} with {bound, algorithm, flag} triples:
   loop_1_byte (6), loop (48), rep_prefix_4_byte (20480), then libcall for
   the final -1 entry.
   Element 1 is DUMMY_STRINGOP_ALGS, which the top of the file defines as
   {libcall, {{-1, libcall, false}}}.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one --
   confirm against the stringop_algs declaration.  */
2748 static stringop_algs pentium4_memset
[2] = {
2749 {libcall
, {{6, loop_1_byte
, false}, {48, loop
, false},
2750 {20480, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2751 DUMMY_STRINGOP_ALGS
};
/* Operation cost table for Pentium 4 tuning.  The initializer is positional:
   values must stay in the field order of struct processor_costs, and the
   trailing comment on each line names the field being set.  All four
   alignment strings are NULL here.
   NOTE(review): in this listing the initializer has no closing "};", never
   names pentium4_memcpy / pentium4_memset, and has no MOVE_RATIO entry
   between the "large" insn and CLEAR_RATIO lines (nocona_cost and atom_cost
   in this file do have one).  The leading 4-digit numbers look like listing
   line numbers rather than data.  Lines appear to have been lost in
   extraction -- verify against upstream before relying on it.  */
2754 struct processor_costs pentium4_cost
= {
2756 /* Start of register allocator costs. integer->integer move cost is 2. */
2757 5, /* cost for loading QImode using movzbl */
2758 {4, 5, 4}, /* cost of loading integer registers
2759 in QImode, HImode and SImode.
2760 Relative to reg-reg move (2). */
2761 {2, 3, 2}, /* cost of storing integer registers */
2762 12, /* cost of reg,reg fld/fst */
2763 {14, 14, 14}, /* cost of loading fp registers
2764 in SFmode, DFmode and XFmode */
2765 {14, 14, 14}, /* cost of storing fp registers
2766 in SFmode, DFmode and XFmode */
2767 12, /* cost of moving MMX register */
2768 {16, 16}, /* cost of loading MMX registers
2769 in SImode and DImode */
2770 {16, 16}, /* cost of storing MMX registers
2771 in SImode and DImode */
2772 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2773 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
2774 in 32,64,128,256 and 512-bit */
2775 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
2776 in 32,64,128,256 and 512-bit */
2777 20, 12, /* SSE->integer and integer->SSE moves */
2778 20, 12, /* mask->integer and integer->mask moves */
2779 {4, 5, 4}, /* cost of loading mask register
2780 in QImode, HImode, SImode. */
2781 {2, 3, 2}, /* cost of storing mask register
2782 in QImode, HImode, SImode. */
2783 2, /* cost of moving mask register. */
2784 /* End of register allocator costs. */
2787 COSTS_N_INSNS (1), /* cost of an add instruction */
2788 COSTS_N_INSNS (3), /* cost of a lea instruction */
2789 COSTS_N_INSNS (4), /* variable shift costs */
2790 COSTS_N_INSNS (4), /* constant shift costs */
2791 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
2792 COSTS_N_INSNS (15), /* HI */
2793 COSTS_N_INSNS (15), /* SI */
2794 COSTS_N_INSNS (15), /* DI */
2795 COSTS_N_INSNS (15)}, /* other */
2796 0, /* cost of multiply per each bit set */
2797 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
2798 COSTS_N_INSNS (56), /* HI */
2799 COSTS_N_INSNS (56), /* SI */
2800 COSTS_N_INSNS (56), /* DI */
2801 COSTS_N_INSNS (56)}, /* other */
2802 COSTS_N_INSNS (1), /* cost of movsx */
2803 COSTS_N_INSNS (1), /* cost of movzx */
2804 16, /* "large" insn */
2806 6, /* CLEAR_RATIO */
2807 {4, 5, 4}, /* cost of loading integer registers
2808 in QImode, HImode and SImode.
2809 Relative to reg-reg move (2). */
2810 {2, 3, 2}, /* cost of storing integer registers */
2811 {16, 16, 16, 32, 64}, /* cost of loading SSE register
2812 in 32bit, 64bit, 128bit, 256bit and 512bit */
2813 {16, 16, 16, 32, 64}, /* cost of storing SSE register
2814 in 32bit, 64bit, 128bit, 256bit and 512bit */
2815 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
2816 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
2817 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2818 20, /* cost of moving SSE register to integer. */
2819 16, 16, /* Gather load static, per_elt. */
2820 16, 16, /* Gather store static, per_elt. */
2821 8, /* size of l1 cache. */
2822 256, /* size of l2 cache. */
2823 64, /* size of prefetch block */
2824 6, /* number of parallel prefetches */
2825 2, /* Branch cost */
2826 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
2827 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2828 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
2829 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2830 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2831 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
2833 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2834 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2835 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
2836 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
2837 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2838 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2839 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
2840 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2841 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2842 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
2843 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2846 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2847 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2848 NULL
, /* Loop alignment. */
2849 NULL
, /* Jump alignment. */
2850 NULL
, /* Label alignment. */
2851 NULL
, /* Func alignment. */
2852 4, /* Small unroll limit. */
2853 2, /* Small unroll factor. */
/* Nocona tuning: strategy table for memcpy (two elements).
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop_1_byte (12), then rep_prefix_4_byte;
   element 1 goes loop (32), rep_prefix_8_byte (20000), unrolled_loop
   (100000), then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (4-byte vs 8-byte rep prefix) -- confirm against the stringop_algs
   declaration.  */
2856 static stringop_algs nocona_memcpy
[2] = {
2857 {libcall
, {{12, loop_1_byte
, false}, {-1, rep_prefix_4_byte
, false}}},
2858 {libcall
, {{32, loop
, false}, {20000, rep_prefix_8_byte
, false},
2859 {100000, unrolled_loop
, false}, {-1, libcall
, false}}}};
/* Nocona tuning: strategy table for memset (two elements).
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop_1_byte (6), loop (48), rep_prefix_4_byte
   (20480), then libcall; element 1 goes loop (24), unrolled_loop (64),
   rep_prefix_8_byte (8192), then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (4-byte vs 8-byte rep prefix) -- confirm against the stringop_algs
   declaration.  */
2861 static stringop_algs nocona_memset
[2] = {
2862 {libcall
, {{6, loop_1_byte
, false}, {48, loop
, false},
2863 {20480, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2864 {libcall
, {{24, loop
, false}, {64, unrolled_loop
, false},
2865 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Operation cost table for Nocona tuning.  The initializer is positional:
   values must stay in the field order of struct processor_costs, and the
   trailing comment on each line names the field being set.  All four
   alignment strings are NULL here.
   NOTE(review): in this listing the initializer has no closing "};" and
   never names nocona_memcpy / nocona_memset; the leading 4-digit numbers
   look like listing line numbers rather than data.  Lines appear to have
   been lost in extraction -- verify against upstream before relying on it.  */
2868 struct processor_costs nocona_cost
= {
2870 /* Start of register allocator costs. integer->integer move cost is 2. */
2871 4, /* cost for loading QImode using movzbl */
2872 {4, 4, 4}, /* cost of loading integer registers
2873 in QImode, HImode and SImode.
2874 Relative to reg-reg move (2). */
2875 {4, 4, 4}, /* cost of storing integer registers */
2876 12, /* cost of reg,reg fld/fst */
2877 {14, 14, 14}, /* cost of loading fp registers
2878 in SFmode, DFmode and XFmode */
2879 {14, 14, 14}, /* cost of storing fp registers
2880 in SFmode, DFmode and XFmode */
2881 14, /* cost of moving MMX register */
2882 {12, 12}, /* cost of loading MMX registers
2883 in SImode and DImode */
2884 {12, 12}, /* cost of storing MMX registers
2885 in SImode and DImode */
2886 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2887 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2888 in 32,64,128,256 and 512-bit */
2889 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2890 in 32,64,128,256 and 512-bit */
2891 20, 12, /* SSE->integer and integer->SSE moves */
2892 20, 12, /* mask->integer and integer->mask moves */
2893 {4, 4, 4}, /* cost of loading mask register
2894 in QImode, HImode, SImode. */
2895 {4, 4, 4}, /* cost of storing mask register
2896 in QImode, HImode, SImode. */
2897 2, /* cost of moving mask register. */
2898 /* End of register allocator costs. */
2901 COSTS_N_INSNS (1), /* cost of an add instruction */
2902 COSTS_N_INSNS (1), /* cost of a lea instruction */
2903 COSTS_N_INSNS (1), /* variable shift costs */
2904 COSTS_N_INSNS (1), /* constant shift costs */
2905 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2906 COSTS_N_INSNS (10), /* HI */
2907 COSTS_N_INSNS (10), /* SI */
2908 COSTS_N_INSNS (10), /* DI */
2909 COSTS_N_INSNS (10)}, /* other */
2910 0, /* cost of multiply per each bit set */
2911 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2912 COSTS_N_INSNS (66), /* HI */
2913 COSTS_N_INSNS (66), /* SI */
2914 COSTS_N_INSNS (66), /* DI */
2915 COSTS_N_INSNS (66)}, /* other */
2916 COSTS_N_INSNS (1), /* cost of movsx */
2917 COSTS_N_INSNS (1), /* cost of movzx */
2918 16, /* "large" insn */
2919 17, /* MOVE_RATIO */
2920 6, /* CLEAR_RATIO */
2921 {4, 4, 4}, /* cost of loading integer registers
2922 in QImode, HImode and SImode.
2923 Relative to reg-reg move (2). */
2924 {4, 4, 4}, /* cost of storing integer registers */
2925 {12, 12, 12, 24, 48}, /* cost of loading SSE register
2926 in 32bit, 64bit, 128bit, 256bit and 512bit */
2927 {12, 12, 12, 24, 48}, /* cost of storing SSE register
2928 in 32bit, 64bit, 128bit, 256bit and 512bit */
2929 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2930 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2931 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2932 20, /* cost of moving SSE register to integer. */
2933 12, 12, /* Gather load static, per_elt. */
2934 12, 12, /* Gather store static, per_elt. */
2935 8, /* size of l1 cache. */
2936 1024, /* size of l2 cache. */
2937 64, /* size of prefetch block */
2938 8, /* number of parallel prefetches */
2939 1, /* Branch cost */
2940 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2941 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2942 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2943 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2944 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2945 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2947 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2948 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2949 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2950 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2951 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2952 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2953 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2954 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2955 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2956 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2957 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2960 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2961 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2962 NULL
, /* Loop alignment. */
2963 NULL
, /* Jump alignment. */
2964 NULL
, /* Label alignment. */
2965 NULL
, /* Func alignment. */
2966 4, /* Small unroll limit. */
2967 2, /* Small unroll factor. */
/* Atom tuning: strategy table for memcpy (two elements).
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop (11), then rep_prefix_4_byte; element 1 goes
   loop (32), rep_prefix_4_byte (64), rep_prefix_8_byte (8192), then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (only element 1 uses the 8-byte rep prefix) -- confirm against the
   stringop_algs declaration.  */
2970 static stringop_algs atom_memcpy
[2] = {
2971 {libcall
, {{11, loop
, false}, {-1, rep_prefix_4_byte
, false}}},
2972 {libcall
, {{32, loop
, false}, {64, rep_prefix_4_byte
, false},
2973 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Atom tuning: strategy table for memset (two elements).
   Each element is {algorithm, list}, where the list holds
   {bound, algorithm, flag} triples with ascending bounds and a final -1
   entry.  Element 0 goes loop (8), unrolled_loop (15), rep_prefix_4_byte
   (2048), then libcall; element 1 goes loop (24), unrolled_loop (32),
   rep_prefix_8_byte (8192), then libcall.
   Presumably element 0 is the 32-bit variant and element 1 the 64-bit one
   (4-byte vs 8-byte rep prefix) -- confirm against the stringop_algs
   declaration.  */
2974 static stringop_algs atom_memset
[2] = {
2975 {libcall
, {{8, loop
, false}, {15, unrolled_loop
, false},
2976 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2977 {libcall
, {{24, loop
, false}, {32, unrolled_loop
, false},
2978 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
2980 struct processor_costs atom_cost
= {
2982 /* Start of register allocator costs. integer->integer move cost is 2. */
2983 6, /* cost for loading QImode using movzbl */
2984 {6, 6, 6}, /* cost of loading integer registers
2985 in QImode, HImode and SImode.
2986 Relative to reg-reg move (2). */
2987 {6, 6, 6}, /* cost of storing integer registers */
2988 4, /* cost of reg,reg fld/fst */
2989 {6, 6, 18}, /* cost of loading fp registers
2990 in SFmode, DFmode and XFmode */
2991 {14, 14, 24}, /* cost of storing fp registers
2992 in SFmode, DFmode and XFmode */
2993 2, /* cost of moving MMX register */
2994 {8, 8}, /* cost of loading MMX registers
2995 in SImode and DImode */
2996 {10, 10}, /* cost of storing MMX registers
2997 in SImode and DImode */
2998 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2999 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
3000 in 32,64,128,256 and 512-bit */
3001 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
3002 in 32,64,128,256 and 512-bit */
3003 8, 6, /* SSE->integer and integer->SSE moves */
3004 8, 6, /* mask->integer and integer->mask moves */
3005 {6, 6, 6}, /* cost of loading mask register
3006 in QImode, HImode, SImode. */
3007 {6, 6, 6}, /* cost if storing mask register
3008 in QImode, HImode, SImode. */
3009 2, /* cost of moving mask register. */
3010 /* End of register allocator costs. */
3013 COSTS_N_INSNS (1), /* cost of an add instruction */
3014 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3015 COSTS_N_INSNS (1), /* variable shift costs */
3016 COSTS_N_INSNS (1), /* constant shift costs */
3017 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3018 COSTS_N_INSNS (4), /* HI */
3019 COSTS_N_INSNS (3), /* SI */
3020 COSTS_N_INSNS (4), /* DI */
3021 COSTS_N_INSNS (2)}, /* other */
3022 0, /* cost of multiply per each bit set */
3023 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3024 COSTS_N_INSNS (26), /* HI */
3025 COSTS_N_INSNS (42), /* SI */
3026 COSTS_N_INSNS (74), /* DI */
3027 COSTS_N_INSNS (74)}, /* other */
3028 COSTS_N_INSNS (1), /* cost of movsx */
3029 COSTS_N_INSNS (1), /* cost of movzx */
3030 8, /* "large" insn */
3031 17, /* MOVE_RATIO */
3032 6, /* CLEAR_RATIO */
3033 {6, 6, 6}, /* cost of loading integer registers
3034 in QImode, HImode and SImode.
3035 Relative to reg-reg move (2). */
3036 {6, 6, 6}, /* cost of storing integer registers */
3037 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3038 in 32bit, 64bit, 128bit, 256bit and 512bit */
3039 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3040 in 32bit, 64bit, 128bit, 256bit and 512bit */
3041 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
3042 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
3043 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3044 8, /* cost of moving SSE register to integer. */
3045 8, 8, /* Gather load static, per_elt. */
3046 8, 8, /* Gather store static, per_elt. */
3047 32, /* size of l1 cache. */
3048 256, /* size of l2 cache. */
3049 64, /* size of prefetch block */
3050 6, /* number of parallel prefetches */
3051 3, /* Branch cost */
3052 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3053 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3054 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3055 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3056 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3057 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3059 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3060 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
3061 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3062 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3063 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3064 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3065 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
3066 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
3067 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
3068 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
3069 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
3072 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3073 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3074 "16", /* Loop alignment. */
3075 "16:8:8", /* Jump alignment. */
3076 "0:0:8", /* Label alignment. */
3077 "16", /* Func alignment. */
3078 4, /* Small unroll limit. */
3079 2, /* Small unroll factor. */
/* Table slm_memcpy: two stringop_algs entries.  Each entry names a default
   algorithm and then lists {bound, algorithm, flag} triples whose last
   triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3082 static stringop_algs slm_memcpy
[2] = {
3083 {libcall
, {{11, loop
, false}, {-1, rep_prefix_4_byte
, false}}},
3084 {libcall
, {{32, loop
, false}, {64, rep_prefix_4_byte
, false},
3085 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Table slm_memset: two stringop_algs entries.  Each entry names a default
   algorithm and then lists {bound, algorithm, flag} triples whose last
   triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3086 static stringop_algs slm_memset
[2] = {
3087 {libcall
, {{8, loop
, false}, {15, unrolled_loop
, false},
3088 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
3089 {libcall
, {{24, loop
, false}, {32, unrolled_loop
, false},
3090 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Cost table slm_cost.  The initializer is positional: the comment that
   trails each value names the field it fills.  Values up to "End of
   register allocator costs" are move/load/store costs relative to a
   reg-reg move of 2; the instruction costs that follow are written with
   COSTS_N_INSNS.
   NOTE(review): this copy shows neither a closing "};" for the initializer
   nor a reference to the slm_memcpy/slm_memset tables -- confirm against
   the upstream file.  */
3092 struct processor_costs slm_cost
= {
3094 /* Start of register allocator costs. integer->integer move cost is 2. */
3095 8, /* cost for loading QImode using movzbl */
3096 {8, 8, 8}, /* cost of loading integer registers
3097 in QImode, HImode and SImode.
3098 Relative to reg-reg move (2). */
3099 {6, 6, 6}, /* cost of storing integer registers */
3100 2, /* cost of reg,reg fld/fst */
3101 {8, 8, 18}, /* cost of loading fp registers
3102 in SFmode, DFmode and XFmode */
3103 {6, 6, 18}, /* cost of storing fp registers
3104 in SFmode, DFmode and XFmode */
3105 2, /* cost of moving MMX register */
3106 {8, 8}, /* cost of loading MMX registers
3107 in SImode and DImode */
3108 {6, 6}, /* cost of storing MMX registers
3109 in SImode and DImode */
3110 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3111 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
3112 in 32,64,128,256 and 512-bit */
3113 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
3114 in 32,64,128,256 and 512-bit */
3115 8, 6, /* SSE->integer and integer->SSE moves */
3116 8, 6, /* mask->integer and integer->mask moves */
3117 {8, 8, 8}, /* cost of loading mask register
3118 in QImode, HImode, SImode. */
3119 {6, 6, 6}, /* cost of storing mask register
3120 in QImode, HImode, SImode. */
3121 2, /* cost of moving mask register. */
3122 /* End of register allocator costs. */
3125 COSTS_N_INSNS (1), /* cost of an add instruction */
3126 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3127 COSTS_N_INSNS (1), /* variable shift costs */
3128 COSTS_N_INSNS (1), /* constant shift costs */
3129 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3130 COSTS_N_INSNS (3), /* HI */
3131 COSTS_N_INSNS (3), /* SI */
3132 COSTS_N_INSNS (4), /* DI */
3133 COSTS_N_INSNS (2)}, /* other */
3134 0, /* cost of multiply per each bit set */
3135 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3136 COSTS_N_INSNS (26), /* HI */
3137 COSTS_N_INSNS (42), /* SI */
3138 COSTS_N_INSNS (74), /* DI */
3139 COSTS_N_INSNS (74)}, /* other */
3140 COSTS_N_INSNS (1), /* cost of movsx */
3141 COSTS_N_INSNS (1), /* cost of movzx */
3142 8, /* "large" insn */
3143 17, /* MOVE_RATIO */
3144 6, /* CLEAR_RATIO */
3145 {8, 8, 8}, /* cost of loading integer registers
3146 in QImode, HImode and SImode.
3147 Relative to reg-reg move (2). */
3148 {6, 6, 6}, /* cost of storing integer registers */
3149 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3150 in 32bit, 64bit, 128bit, 256bit and 512bit */
3151 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3152 in 32bit, 64bit, 128bit, 256bit and 512bit */
3153 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
3154 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
3155 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3156 8, /* cost of moving SSE register to integer. */
3157 8, 8, /* Gather load static, per_elt. */
3158 8, 8, /* Gather store static, per_elt. */
3159 32, /* size of l1 cache. */
3160 256, /* size of l2 cache. */
3161 64, /* size of prefetch block */
3162 6, /* number of parallel prefetches */
3163 3, /* Branch cost */
3164 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3165 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3166 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3167 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3168 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3169 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3171 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3172 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3173 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3174 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3175 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3176 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3177 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
3178 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
3179 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3180 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3181 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3184 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3185 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3186 "16", /* Loop alignment. */
3187 "16:8:8", /* Jump alignment. */
3188 "0:0:8", /* Label alignment. */
3189 "16", /* Func alignment. */
3190 4, /* Small unroll limit. */
3191 2, /* Small unroll factor. */
/* Table tremont_memcpy: two stringop_algs entries, each listing
   {bound, algorithm, flag} triples whose last triple has a bound of -1.
   NOTE(review): the braces below do not balance in this copy, and unlike
   the sibling tables no entry starts with a default algorithm such as
   "{libcall," -- lines appear to be missing; confirm against the upstream
   file before relying on this text.  */
3194 static stringop_algs tremont_memcpy
[2] = {
3196 {{256, rep_prefix_1_byte
, true},
3198 {-1, libcall
, false}}},
3200 {{256, rep_prefix_1_byte
, true},
3202 {-1, libcall
, false}}}};
/* Table tremont_memset: two stringop_algs entries, each listing
   {bound, algorithm, flag} triples whose last triple has a bound of -1.
   NOTE(review): the braces below do not balance in this copy, and unlike
   the sibling tables no entry starts with a default algorithm such as
   "{libcall," -- lines appear to be missing; confirm against the upstream
   file before relying on this text.  */
3203 static stringop_algs tremont_memset
[2] = {
3205 {{256, rep_prefix_1_byte
, true},
3207 {-1, libcall
, false}}},
3209 {{256, rep_prefix_1_byte
, true},
3211 {-1, libcall
, false}}}};
/* Cost table tremont_cost.  The initializer is positional: the comment that
   trails each value names the field it fills.  Values up to "End of
   register allocator costs" are move/load/store costs relative to a
   reg-reg move of 2; the instruction costs that follow are written with
   COSTS_N_INSNS.
   NOTE(review): this copy shows neither a closing "};" for the initializer
   nor a reference to the tremont_memcpy/tremont_memset tables -- confirm
   against the upstream file.  */
3213 struct processor_costs tremont_cost
= {
3215 /* Start of register allocator costs. integer->integer move cost is 2. */
3216 6, /* cost for loading QImode using movzbl */
3217 {6, 6, 6}, /* cost of loading integer registers
3218 in QImode, HImode and SImode.
3219 Relative to reg-reg move (2). */
3220 {6, 6, 6}, /* cost of storing integer registers */
3221 4, /* cost of reg,reg fld/fst */
3222 {6, 6, 12}, /* cost of loading fp registers
3223 in SFmode, DFmode and XFmode */
3224 {6, 6, 12}, /* cost of storing fp registers
3225 in SFmode, DFmode and XFmode */
3226 2, /* cost of moving MMX register */
3227 {6, 6}, /* cost of loading MMX registers
3228 in SImode and DImode */
3229 {6, 6}, /* cost of storing MMX registers
3230 in SImode and DImode */
3231 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3232 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3233 in 32,64,128,256 and 512-bit */
3234 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3235 in 32,64,128,256 and 512-bit */
3236 6, 6, /* SSE->integer and integer->SSE moves */
3237 6, 6, /* mask->integer and integer->mask moves */
3238 {6, 6, 6}, /* cost of loading mask register
3239 in QImode, HImode, SImode. */
3240 {6, 6, 6}, /* cost of storing mask register
3241 in QImode, HImode, SImode. */
3242 2, /* cost of moving mask register. */
3243 /* End of register allocator costs. */
3246 COSTS_N_INSNS (1), /* cost of an add instruction */
3247 /* Setting cost to 2 makes our current implementation of synth_mult result in
3248 use of unnecessary temporary registers causing regression on several
3249 SPECfp benchmarks. */
3250 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3251 COSTS_N_INSNS (1), /* variable shift costs */
3252 COSTS_N_INSNS (1), /* constant shift costs */
3253 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3254 COSTS_N_INSNS (3), /* HI */
3255 COSTS_N_INSNS (3), /* SI */
3256 COSTS_N_INSNS (3), /* DI */
3257 COSTS_N_INSNS (4)}, /* other */
3258 0, /* cost of multiply per each bit set */
3259 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3260 COSTS_N_INSNS (22), /* HI */
3261 COSTS_N_INSNS (30), /* SI */
3262 COSTS_N_INSNS (74), /* DI */
3263 COSTS_N_INSNS (74)}, /* other */
3264 COSTS_N_INSNS (1), /* cost of movsx */
3265 COSTS_N_INSNS (1), /* cost of movzx */
3266 8, /* "large" insn */
3267 17, /* MOVE_RATIO */
3268 17, /* CLEAR_RATIO */
3269 {6, 6, 6}, /* cost of loading integer registers
3270 in QImode, HImode and SImode.
3271 Relative to reg-reg move (2). */
3272 {6, 6, 6}, /* cost of storing integer registers */
3273 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3274 in 32bit, 64bit, 128bit, 256bit and 512bit */
3275 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3276 in 32bit, 64bit, 128bit, 256bit and 512bit */
3277 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3278 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3279 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3280 6, /* cost of moving SSE register to integer. */
3281 18, 6, /* Gather load static, per_elt. */
3282 18, 6, /* Gather store static, per_elt. */
3283 32, /* size of l1 cache. */
3284 512, /* size of l2 cache. */
3285 64, /* size of prefetch block */
3286 6, /* number of parallel prefetches */
3287 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3288 value is increased to the perhaps more appropriate value of 5.
   NOTE(review): this remark names K8 although it sits in the tremont
   table; presumably it was carried over from another table -- confirm
   that it still applies here. */
3289 3, /* Branch cost */
3290 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3291 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3292 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3293 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3294 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3295 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3297 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3298 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3299 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3300 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3301 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3302 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3303 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3304 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3305 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3306 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3307 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3310 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3311 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3312 "16:11:8", /* Loop alignment. */
3313 "16:11:8", /* Jump alignment. */
3314 "0:0:8", /* Label alignment. */
3315 "16", /* Func alignment. */
3316 4, /* Small unroll limit. */
3317 2, /* Small unroll factor. */
/* Table intel_memcpy: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3320 static stringop_algs intel_memcpy
[2] = {
3321 {libcall
, {{11, loop
, false}, {-1, rep_prefix_4_byte
, false}}},
3322 {libcall
, {{32, loop
, false}, {64, rep_prefix_4_byte
, false},
3323 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Table intel_memset: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3324 static stringop_algs intel_memset
[2] = {
3325 {libcall
, {{8, loop
, false}, {15, unrolled_loop
, false},
3326 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
3327 {libcall
, {{24, loop
, false}, {32, unrolled_loop
, false},
3328 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Cost table intel_cost.  The initializer is positional: the comment that
   trails each value names the field it fills.  Values up to "End of
   register allocator costs" are move/load/store costs relative to a
   reg-reg move of 2; the instruction costs that follow are written with
   COSTS_N_INSNS.
   NOTE(review): this copy shows neither a closing "};" for the initializer
   nor a reference to the intel_memcpy/intel_memset tables -- confirm
   against the upstream file.  */
3330 struct processor_costs intel_cost
= {
3332 /* Start of register allocator costs. integer->integer move cost is 2. */
3333 6, /* cost for loading QImode using movzbl */
3334 {4, 4, 4}, /* cost of loading integer registers
3335 in QImode, HImode and SImode.
3336 Relative to reg-reg move (2). */
3337 {6, 6, 6}, /* cost of storing integer registers */
3338 2, /* cost of reg,reg fld/fst */
3339 {6, 6, 8}, /* cost of loading fp registers
3340 in SFmode, DFmode and XFmode */
3341 {6, 6, 10}, /* cost of storing fp registers
3342 in SFmode, DFmode and XFmode */
3343 2, /* cost of moving MMX register */
3344 {6, 6}, /* cost of loading MMX registers
3345 in SImode and DImode */
3346 {6, 6}, /* cost of storing MMX registers
3347 in SImode and DImode */
3348 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3349 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
3350 in 32,64,128,256 and 512-bit */
3351 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
3352 in 32,64,128,256 and 512-bit */
3353 4, 4, /* SSE->integer and integer->SSE moves */
3354 4, 4, /* mask->integer and integer->mask moves */
3355 {4, 4, 4}, /* cost of loading mask register
3356 in QImode, HImode, SImode. */
3357 {6, 6, 6}, /* cost of storing mask register
3358 in QImode, HImode, SImode. */
3359 2, /* cost of moving mask register. */
3360 /* End of register allocator costs. */
3363 COSTS_N_INSNS (1), /* cost of an add instruction */
3364 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3365 COSTS_N_INSNS (1), /* variable shift costs */
3366 COSTS_N_INSNS (1), /* constant shift costs */
3367 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3368 COSTS_N_INSNS (3), /* HI */
3369 COSTS_N_INSNS (3), /* SI */
3370 COSTS_N_INSNS (4), /* DI */
3371 COSTS_N_INSNS (2)}, /* other */
3372 0, /* cost of multiply per each bit set */
3373 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3374 COSTS_N_INSNS (26), /* HI */
3375 COSTS_N_INSNS (42), /* SI */
3376 COSTS_N_INSNS (74), /* DI */
3377 COSTS_N_INSNS (74)}, /* other */
3378 COSTS_N_INSNS (1), /* cost of movsx */
3379 COSTS_N_INSNS (1), /* cost of movzx */
3380 8, /* "large" insn */
3381 17, /* MOVE_RATIO */
3382 6, /* CLEAR_RATIO */
3383 {4, 4, 4}, /* cost of loading integer registers
3384 in QImode, HImode and SImode.
3385 Relative to reg-reg move (2). */
3386 {6, 6, 6}, /* cost of storing integer registers */
3387 {6, 6, 6, 6, 6}, /* cost of loading SSE register
3388 in 32bit, 64bit, 128bit, 256bit and 512bit */
3389 {6, 6, 6, 6, 6}, /* cost of storing SSE register
3390 in 32bit, 64bit, 128bit, 256bit and 512bit */
3391 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
3392 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
3393 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3394 4, /* cost of moving SSE register to integer. */
3395 6, 6, /* Gather load static, per_elt. */
3396 6, 6, /* Gather store static, per_elt. */
3397 32, /* size of l1 cache. */
3398 256, /* size of l2 cache. */
3399 64, /* size of prefetch block */
3400 6, /* number of parallel prefetches */
3401 3, /* Branch cost */
3402 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3403 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3404 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3405 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3406 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3407 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3409 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3410 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
3411 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
3412 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
3413 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3414 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3415 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
3416 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
3417 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
3418 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
3419 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3422 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3423 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3424 "16", /* Loop alignment. */
3425 "16:8:8", /* Jump alignment. */
3426 "0:0:8", /* Label alignment. */
3427 "16", /* Func alignment. */
3428 4, /* Small unroll limit. */
3429 2, /* Small unroll factor. */
3432 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
/* Table lujiazui_memcpy: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3433 static stringop_algs lujiazui_memcpy
[2] = {
3434 {libcall
, {{32, loop
, false}, {8192, rep_prefix_4_byte
, false},
3435 {-1, libcall
, false}}},
3436 {libcall
, {{12, unrolled_loop
, true}, {32, loop
, false},
3437 {6144, rep_prefix_8_byte
, false},
3438 {-1, libcall
, false}}}};
/* Table lujiazui_memset: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3439 static stringop_algs lujiazui_memset
[2] = {
3440 {libcall
, {{32, loop
, false}, {8192, rep_prefix_4_byte
, false},
3441 {-1, libcall
, false}}},
3442 {libcall
, {{12, loop
, true}, {32, loop
, false},
3443 {640, rep_prefix_8_byte
, false},
3444 {-1, libcall
, false}}}};
/* Cost table lujiazui_cost.  The initializer is positional: the comment
   that trails each value names the field it fills.  Values up to "End of
   register allocator costs" are move/load/store costs relative to a
   reg-reg move of 2; the instruction costs that follow are written with
   COSTS_N_INSNS.
   NOTE(review): this copy shows neither a closing "};" for the initializer
   nor a reference to the lujiazui_memcpy/lujiazui_memset tables -- confirm
   against the upstream file.  */
3446 struct processor_costs lujiazui_cost
= {
3448 /* Start of register allocator costs. integer->integer move cost is 2. */
3449 6, /* cost for loading QImode using movzbl. */
3450 {6, 6, 6}, /* cost of loading integer registers
3451 in QImode, HImode and SImode.
3452 Relative to reg-reg move (2). */
3453 {6, 6, 6}, /* cost of storing integer registers. */
3454 2, /* cost of reg,reg fld/fst. */
3455 {6, 6, 8}, /* cost of loading fp registers
3456 in SFmode, DFmode and XFmode. */
3457 {6, 6, 8}, /* cost of storing fp registers
3458 in SFmode, DFmode and XFmode. */
3459 2, /* cost of moving MMX register. */
3460 {6, 6}, /* cost of loading MMX registers
3461 in SImode and DImode. */
3462 {6, 6}, /* cost of storing MMX registers
3463 in SImode and DImode. */
3464 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3465 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3466 in 32,64,128,256 and 512-bit. */
3467 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3468 in 32,64,128,256 and 512-bit. */
3469 6, 6, /* SSE->integer and integer->SSE moves. */
3470 6, 6, /* mask->integer and integer->mask moves. */
3471 {6, 6, 6}, /* cost of loading mask register
3472 in QImode, HImode, SImode. */
3473 {6, 6, 6}, /* cost of storing mask register
3474 in QImode, HImode, SImode. */
3475 2, /* cost of moving mask register. */
3476 /* End of register allocator costs. */
3479 COSTS_N_INSNS (1), /* cost of an add instruction. */
3480 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction. */
3481 COSTS_N_INSNS (1), /* variable shift costs. */
3482 COSTS_N_INSNS (1), /* constant shift costs. */
3483 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3484 COSTS_N_INSNS (3), /* HI. */
3485 COSTS_N_INSNS (3), /* SI. */
3486 COSTS_N_INSNS (12), /* DI. */
3487 COSTS_N_INSNS (14)}, /* other. */
3488 0, /* cost of multiply per each bit set. */
3489 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI. */
3490 COSTS_N_INSNS (24), /* HI. */
3491 COSTS_N_INSNS (24), /* SI. */
3492 COSTS_N_INSNS (150), /* DI. */
3493 COSTS_N_INSNS (152)}, /* other. */
3494 COSTS_N_INSNS (1), /* cost of movsx. */
3495 COSTS_N_INSNS (1), /* cost of movzx. */
3496 8, /* "large" insn. */
3497 17, /* MOVE_RATIO. */
3498 6, /* CLEAR_RATIO. */
3499 {6, 6, 6}, /* cost of loading integer registers
3500 in QImode, HImode and SImode.
3501 Relative to reg-reg move (2). */
3502 {6, 6, 6}, /* cost of storing integer registers. */
3503 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3504 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3505 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3506 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3507 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3508 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3509 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3510 6, /* cost of moving SSE register to integer. */
3511 18, 6, /* Gather load static, per_elt. */
3512 18, 6, /* Gather store static, per_elt. */
3513 32, /* size of l1 cache. */
3514 4096, /* size of l2 cache. */
3515 64, /* size of prefetch block. */
3516 /* Lujiazui processors never drop prefetches, like AMD processors. */
3517 100, /* number of parallel prefetches. */
3518 3, /* Branch cost. */
3519 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3520 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
3521 COSTS_N_INSNS (22), /* cost of FDIV instruction. */
3522 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3523 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3524 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3526 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3527 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3528 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3529 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
3530 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3531 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3532 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3533 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3534 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
3535 COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */
3536 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3539 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3540 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3541 "16:11:8", /* Loop alignment. */
3542 "16:11:8", /* Jump alignment. */
3543 "0:0:8", /* Label alignment. */
3544 "16", /* Func alignment. */
3545 4, /* Small unroll limit. */
3546 2, /* Small unroll factor. */
3549 /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
/* Table yongfeng_memcpy: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3550 static stringop_algs yongfeng_memcpy
[2] = {
3551 {libcall
, {{6, unrolled_loop
, true}, {256, unrolled_loop
, false},
3552 {-1, libcall
, false}}},
3553 {libcall
, {{8, loop
, false}, {512, unrolled_loop
, false},
3554 {-1, libcall
, false}}}};
/* Table yongfeng_memset: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3555 static stringop_algs yongfeng_memset
[2] = {
3556 {libcall
, {{6, loop_1_byte
, false}, {128, loop
, false},
3557 {-1, libcall
, false}}},
3558 {libcall
, {{2, rep_prefix_4_byte
, false}, {64, loop
, false},
3559 {1024, vector_loop
, false},
3560 {-1, libcall
, false}}}};
/* Cost table yongfeng_cost.  The initializer is positional: the comment
   that trails each value names the field it fills.  Values up to "End of
   register allocator costs" are move/load/store costs relative to a
   reg-reg move of 2; the instruction costs that follow are written with
   COSTS_N_INSNS.
   NOTE(review): this copy shows neither a closing "};" for the initializer
   nor a reference to the yongfeng_memcpy/yongfeng_memset tables -- confirm
   against the upstream file.  */
3562 struct processor_costs yongfeng_cost
= {
3564 /* Start of register allocator costs. integer->integer move cost is 2. */
3565 8, /* cost for loading QImode using movzbl. */
3566 {8, 8, 8}, /* cost of loading integer registers
3567 in QImode, HImode and SImode.
3568 Relative to reg-reg move (2). */
3569 {8, 8, 8}, /* cost of storing integer registers. */
3570 2, /* cost of reg,reg fld/fst. */
3571 {8, 8, 8}, /* cost of loading fp registers
3572 in SFmode, DFmode and XFmode. */
3573 {8, 8, 8}, /* cost of storing fp registers
3574 in SFmode, DFmode and XFmode. */
3575 2, /* cost of moving MMX register. */
3576 {8, 8}, /* cost of loading MMX registers
3577 in SImode and DImode. */
3578 {8, 8}, /* cost of storing MMX registers
3579 in SImode and DImode. */
3580 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3581 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3582 in 32,64,128,256 and 512-bit. */
3583 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3584 in 32,64,128,256 and 512-bit. */
3585 8, 8, /* SSE->integer and integer->SSE moves. */
3586 8, 8, /* mask->integer and integer->mask moves. */
3587 {8, 8, 8}, /* cost of loading mask register
3588 in QImode, HImode, SImode. */
3589 {8, 8, 8}, /* cost of storing mask register
3590 in QImode, HImode, SImode. */
3591 2, /* cost of moving mask register. */
3592 /* End of register allocator costs. */
3595 COSTS_N_INSNS (1), /* cost of an add instruction. */
3596 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3597 COSTS_N_INSNS (1), /* variable shift costs. */
3598 COSTS_N_INSNS (1), /* constant shift costs. */
3599 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3600 COSTS_N_INSNS (3), /* HI. */
3601 COSTS_N_INSNS (2), /* SI. */
3602 COSTS_N_INSNS (2), /* DI. */
3603 COSTS_N_INSNS (3)}, /* other. */
3604 0, /* cost of multiply per each bit set. */
3605 {COSTS_N_INSNS (8), /* cost of a divide/mod for QI. */
3606 COSTS_N_INSNS (9), /* HI. */
3607 COSTS_N_INSNS (8), /* SI. */
3608 COSTS_N_INSNS (41), /* DI. */
3609 COSTS_N_INSNS (41)}, /* other. */
3610 COSTS_N_INSNS (1), /* cost of movsx. */
3611 COSTS_N_INSNS (1), /* cost of movzx. */
3612 8, /* "large" insn. */
3613 17, /* MOVE_RATIO. */
3614 6, /* CLEAR_RATIO. */
3615 {8, 8, 8}, /* cost of loading integer registers
3616 in QImode, HImode and SImode.
3617 Relative to reg-reg move (2). */
3618 {8, 8, 8}, /* cost of storing integer registers. */
3619 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3620 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3621 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3622 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3623 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3624 {8, 8, 8, 12, 15}, /* cost of unaligned stores. */
3625 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3626 8, /* cost of moving SSE register to integer. */
3627 18, 6, /* Gather load static, per_elt. */
3628 18, 6, /* Gather store static, per_elt. */
3629 32, /* size of l1 cache. */
3630 256, /* size of l2 cache. */
3631 64, /* size of prefetch block. */
3632 12, /* number of parallel prefetches. */
3633 3, /* Branch cost. */
3634 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3635 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3636 COSTS_N_INSNS (14), /* cost of FDIV instruction. */
3637 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3638 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3639 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3641 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3642 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3643 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3644 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3645 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3646 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3647 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
3648 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3649 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3650 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3651 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3654 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3655 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3656 "16:11:8", /* Loop alignment. */
3657 "16:11:8", /* Jump alignment. */
3658 "0:0:8", /* Label alignment. */
3659 "16", /* Func alignment. */
3660 4, /* Small unroll limit. */
3661 2, /* Small unroll factor. */
3664 /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
/* Table shijidadao_memcpy: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3665 static stringop_algs shijidadao_memcpy
[2] = {
3666 {libcall
, {{8, unrolled_loop
, true}, {256, unrolled_loop
, false},
3667 {-1, libcall
, false}}},
3668 {libcall
, {{10, loop
, true}, {256, unrolled_loop
, false},
3669 {-1, libcall
, false}}}};
/* Table shijidadao_memset: two stringop_algs entries.  Each entry names a
   default algorithm and then lists {bound, algorithm, flag} triples whose
   last triple has a bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs declaration.  */
3670 static stringop_algs shijidadao_memset
[2] = {
3671 {libcall
, {{4, loop
, true}, {128, unrolled_loop
, false},
3672 {-1, libcall
, false}}},
3673 {libcall
, {{1, rep_prefix_4_byte
, false}, {14, loop
, true},
3674 {1024, vector_loop
, false},
3675 {-1, libcall
, false}}}};
/* Cost table shijidadao_cost.  The initializer is positional: the comment
   that trails each value names the field it fills.  Values up to "End of
   register allocator costs" are move/load/store costs relative to a
   reg-reg move of 2; the instruction costs that follow are written with
   COSTS_N_INSNS.
   NOTE(review): this copy shows neither a closing "};" for the initializer
   nor a reference to the shijidadao_memcpy/shijidadao_memset tables --
   confirm against the upstream file.  */
3677 struct processor_costs shijidadao_cost
= {
3679 /* Start of register allocator costs. integer->integer move cost is 2. */
3680 8, /* cost for loading QImode using movzbl. */
3681 {8, 8, 8}, /* cost of loading integer registers
3682 in QImode, HImode and SImode.
3683 Relative to reg-reg move (2). */
3684 {8, 8, 8}, /* cost of storing integer registers. */
3685 2, /* cost of reg,reg fld/fst. */
3686 {8, 8, 8}, /* cost of loading fp registers
3687 in SFmode, DFmode and XFmode. */
3688 {8, 8, 8}, /* cost of storing fp registers
3689 in SFmode, DFmode and XFmode. */
3690 2, /* cost of moving MMX register. */
3691 {8, 8}, /* cost of loading MMX registers
3692 in SImode and DImode. */
3693 {8, 8}, /* cost of storing MMX registers
3694 in SImode and DImode. */
3695 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3696 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3697 in 32,64,128,256 and 512-bit. */
3698 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3699 in 32,64,128,256 and 512-bit. */
3700 8, 8, /* SSE->integer and integer->SSE moves. */
3701 8, 8, /* mask->integer and integer->mask moves. */
3702 {8, 8, 8}, /* cost of loading mask register
3703 in QImode, HImode, SImode. */
3704 {8, 8, 8}, /* cost of storing mask register
3705 in QImode, HImode, SImode. */
3706 2, /* cost of moving mask register. */
3707 /* End of register allocator costs. */
3710 COSTS_N_INSNS (1), /* cost of an add instruction. */
3711 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3712 COSTS_N_INSNS (1), /* variable shift costs. */
3713 COSTS_N_INSNS (1), /* constant shift costs. */
3714 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3715 COSTS_N_INSNS (3), /* HI. */
3716 COSTS_N_INSNS (2), /* SI. */
3717 COSTS_N_INSNS (2), /* DI. */
3718 COSTS_N_INSNS (3)}, /* other. */
3719 0, /* cost of multiply per each bit set. */
3720 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
3721 COSTS_N_INSNS (10), /* HI. */
3722 COSTS_N_INSNS (9), /* SI. */
3723 COSTS_N_INSNS (50), /* DI. */
3724 COSTS_N_INSNS (50)}, /* other. */
3725 COSTS_N_INSNS (1), /* cost of movsx. */
3726 COSTS_N_INSNS (1), /* cost of movzx. */
3727 8, /* "large" insn. */
3728 17, /* MOVE_RATIO. */
3729 6, /* CLEAR_RATIO. */
3730 {8, 8, 8}, /* cost of loading integer registers
3731 in QImode, HImode and SImode.
3732 Relative to reg-reg move (2). */
3733 {8, 8, 8}, /* cost of storing integer registers. */
3734 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3735 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3736 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3737 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3738 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3739 {8, 8, 8, 12, 15}, /* cost of unaligned stores. */
3740 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3741 8, /* cost of moving SSE register to integer. */
3742 18, 6, /* Gather load static, per_elt. */
3743 18, 6, /* Gather store static, per_elt. */
3744 32, /* size of l1 cache. */
3745 256, /* size of l2 cache. */
3746 64, /* size of prefetch block. */
3747 12, /* number of parallel prefetches. */
3748 3, /* Branch cost. */
3749 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3750 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3751 COSTS_N_INSNS (13), /* cost of FDIV instruction. */
3752 COSTS_N_INSNS (2), /* cost of FABS instruction. */
3753 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
3754 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3756 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3757 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3758 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3759 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3760 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3761 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3762 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
3763 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3764 COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */
3765 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3766 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3769 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3770 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3771 "16:11:8", /* Loop alignment. */
3772 "16:11:8", /* Jump alignment. */
3773 "0:0:8", /* Label alignment. */
3774 "16", /* Func alignment. */
3775 4, /* Small unroll limit. */
3776 2, /* Small unroll factor. */
3781 /* Generic should produce code tuned for Core-i7 (and newer chips)
3782 and btver1 (and newer chips). */
3784 static stringop_algs generic_memcpy
[2] = {
3785 {libcall
, {{32, loop
, false}, {8192, rep_prefix_4_byte
, false},
3786 {-1, libcall
, false}}},
3787 {libcall
, {{32, loop
, false}, {8192, rep_prefix_8_byte
, false},
3788 {-1, libcall
, false}}}};
3789 static stringop_algs generic_memset
[2] = {
3790 {libcall
, {{32, loop
, false}, {8192, rep_prefix_4_byte
, false},
3791 {-1, libcall
, false}}},
3792 {libcall
, {{32, loop
, false}, {8192, rep_prefix_8_byte
, false},
3793 {-1, libcall
, false}}}};
/* Cost table for generic tuning.  The initializer is positional: the
   comment on each line names the field that the value on that line
   initializes, so entries must not be reordered.  */
3795 struct processor_costs generic_cost
= {
3797 /* Start of register allocator costs. integer->integer move cost is 2. */
3798 6, /* cost for loading QImode using movzbl */
3799 {6, 6, 6}, /* cost of loading integer registers
3800 in QImode, HImode and SImode.
3801 Relative to reg-reg move (2). */
3802 {6, 6, 6}, /* cost of storing integer registers */
3803 4, /* cost of reg,reg fld/fst */
3804 {6, 6, 12}, /* cost of loading fp registers
3805 in SFmode, DFmode and XFmode */
3806 {6, 6, 12}, /* cost of storing fp registers
3807 in SFmode, DFmode and XFmode */
3808 2, /* cost of moving MMX register */
3809 {6, 6}, /* cost of loading MMX registers
3810 in SImode and DImode */
3811 {6, 6}, /* cost of storing MMX registers
3812 in SImode and DImode */
3813 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3814 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3815 in 32,64,128,256 and 512-bit */
3816 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3817 in 32,64,128,256 and 512-bit */
3818 6, 6, /* SSE->integer and integer->SSE moves */
3819 6, 6, /* mask->integer and integer->mask moves */
3820 {6, 6, 6}, /* cost of loading mask register
3821 in QImode, HImode, SImode. */
3822 {6, 6, 6}, /* cost of storing mask register
3823 in QImode, HImode, SImode. */
3824 2, /* cost of moving mask register. */
3825 /* End of register allocator costs. */
3828 COSTS_N_INSNS (1), /* cost of an add instruction */
3829 /* Setting cost to 2 makes our current implementation of synth_mult result in
3830 use of unnecessary temporary registers causing regression on several
3831 SPECfp benchmarks. */
/* Hence lea is priced one unit above an add rather than at a full second
   instruction (the file header assumes COSTS_N_INSNS (N) is (N)*4).  */
3832 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3833 COSTS_N_INSNS (1), /* variable shift costs */
3834 COSTS_N_INSNS (1), /* constant shift costs */
3835 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3836 COSTS_N_INSNS (3), /* HI */
3837 COSTS_N_INSNS (3), /* SI */
3838 COSTS_N_INSNS (3), /* DI */
3839 COSTS_N_INSNS (4)}, /* other */
3840 0, /* cost of multiply per each bit set */
3841 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3842 COSTS_N_INSNS (22), /* HI */
3843 COSTS_N_INSNS (30), /* SI */
3844 COSTS_N_INSNS (74), /* DI */
3845 COSTS_N_INSNS (74)}, /* other */
3846 COSTS_N_INSNS (1), /* cost of movsx */
3847 COSTS_N_INSNS (1), /* cost of movzx */
3848 8, /* "large" insn */
3849 17, /* MOVE_RATIO */
3850 6, /* CLEAR_RATIO */
/* The integer and SSE load/store costs below repeat the values given in
   the register allocator section above.  */
3851 {6, 6, 6}, /* cost of loading integer registers
3852 in QImode, HImode and SImode.
3853 Relative to reg-reg move (2). */
3854 {6, 6, 6}, /* cost of storing integer registers */
3855 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3856 in 32bit, 64bit, 128bit, 256bit and 512bit */
3857 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3858 in 32bit, 64bit, 128bit, 256bit and 512bit */
3859 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3860 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3861 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3862 6, /* cost of moving SSE register to integer. */
3863 18, 6, /* Gather load static, per_elt. */
3864 18, 6, /* Gather store static, per_elt. */
3865 32, /* size of l1 cache. */
3866 512, /* size of l2 cache. */
3867 64, /* size of prefetch block */
3868 6, /* number of parallel prefetches */
3869 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3870 value is increased to the perhaps more appropriate value of 5. */
3871 3, /* Branch cost */
3872 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3873 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3874 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3875 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3876 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3877 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3879 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3880 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3881 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3882 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3883 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3884 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3885 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3886 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3887 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3888 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3889 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3892 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3893 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3894 "16", /* Loop alignment. */
3895 "16:11:8", /* Jump alignment. */
3896 "0:0:8", /* Label alignment. */
3897 "16", /* Func alignment. */
3898 4, /* Small unroll limit. */
3899 2, /* Small unroll factor. */
/* core_cost should produce code tuned for the Core family of CPUs.  */
3903 static stringop_algs core_memcpy
[2] = {
3904 {libcall
, {{1024, rep_prefix_4_byte
, true}, {-1, libcall
, false}}},
3905 {libcall
, {{24, loop
, true}, {128, rep_prefix_8_byte
, true},
3906 {-1, libcall
, false}}}};
3907 static stringop_algs core_memset
[2] = {
3908 {libcall
, {{6, loop_1_byte
, true},
3910 {8192, rep_prefix_4_byte
, true},
3911 {-1, libcall
, false}}},
3912 {libcall
, {{24, loop
, true}, {512, rep_prefix_8_byte
, true},
3913 {-1, libcall
, false}}}};
3916 struct processor_costs core_cost
= {
3918 /* Start of register allocator costs. integer->integer move cost is 2. */
3919 6, /* cost for loading QImode using movzbl */
3920 {4, 4, 4}, /* cost of loading integer registers
3921 in QImode, HImode and SImode.
3922 Relative to reg-reg move (2). */
3923 {6, 6, 6}, /* cost of storing integer registers */
3924 2, /* cost of reg,reg fld/fst */
3925 {6, 6, 8}, /* cost of loading fp registers
3926 in SFmode, DFmode and XFmode */
3927 {6, 6, 10}, /* cost of storing fp registers
3928 in SFmode, DFmode and XFmode */
3929 2, /* cost of moving MMX register */
3930 {6, 6}, /* cost of loading MMX registers
3931 in SImode and DImode */
3932 {6, 6}, /* cost of storing MMX registers
3933 in SImode and DImode */
3934 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
3935 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
3936 in 32,64,128,256 and 512-bit */
3937 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
3938 in 32,64,128,256 and 512-bit */
3939 6, 6, /* SSE->integer and integer->SSE moves */
3940 6, 6, /* mask->integer and integer->mask moves */
3941 {4, 4, 4}, /* cost of loading mask register
3942 in QImode, HImode, SImode. */
3943 {6, 6, 6}, /* cost if storing mask register
3944 in QImode, HImode, SImode. */
3945 2, /* cost of moving mask register. */
3946 /* End of register allocator costs. */
3949 COSTS_N_INSNS (1), /* cost of an add instruction */
3950 /* On all chips taken into consideration lea is 2 cycles and more. With
3951 this cost however our current implementation of synth_mult results in
3952 use of unnecessary temporary registers causing regression on several
3953 SPECfp benchmarks. */
3954 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3955 COSTS_N_INSNS (1), /* variable shift costs */
3956 COSTS_N_INSNS (1), /* constant shift costs */
3957 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3958 COSTS_N_INSNS (4), /* HI */
3959 COSTS_N_INSNS (3), /* SI */
3960 /* Here we tune for Sandybridge or newer. */
3961 COSTS_N_INSNS (3), /* DI */
3962 COSTS_N_INSNS (3)}, /* other */
3963 0, /* cost of multiply per each bit set */
3964 /* Expanding div/mod currently doesn't consider parallelism. So the cost
3965 model is not realistic. We compensate by increasing the latencies a bit. */
3966 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
3967 COSTS_N_INSNS (11), /* HI */
3968 COSTS_N_INSNS (14), /* SI */
3969 COSTS_N_INSNS (81), /* DI */
3970 COSTS_N_INSNS (81)}, /* other */
3971 COSTS_N_INSNS (1), /* cost of movsx */
3972 COSTS_N_INSNS (1), /* cost of movzx */
3973 8, /* "large" insn */
3974 17, /* MOVE_RATIO */
3975 6, /* CLEAR_RATIO */
3976 {4, 4, 4}, /* cost of loading integer registers
3977 in QImode, HImode and SImode.
3978 Relative to reg-reg move (2). */
3979 {6, 6, 6}, /* cost of storing integer registers */
3980 {6, 6, 6, 6, 12}, /* cost of loading SSE register
3981 in 32bit, 64bit, 128bit, 256bit and 512bit */
3982 {6, 6, 6, 6, 12}, /* cost of storing SSE register
3983 in 32bit, 64bit, 128bit, 256bit and 512bit */
3984 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
3985 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
3986 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
3987 2, /* cost of moving SSE register to integer. */
3988 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3990 So 5 uops statically and one uops per load. */
3991 10, 6, /* Gather load static, per_elt. */
3992 10, 6, /* Gather store static, per_elt. */
3993 64, /* size of l1 cache. */
3994 512, /* size of l2 cache. */
3995 64, /* size of prefetch block */
3996 6, /* number of parallel prefetches */
3997 /* FIXME perhaps more appropriate value is 5. */
3998 3, /* Branch cost */
3999 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
4000 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
4002 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
4003 COSTS_N_INSNS (1), /* cost of FABS instruction. */
4004 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
4005 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
4007 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
4008 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
4009 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
4010 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
4011 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
4012 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
4013 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
4014 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
4015 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
4016 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
4017 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
4020 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
4021 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
4022 "16:11:8", /* Loop alignment. */
4023 "16:11:8", /* Jump alignment. */
4024 "0:0:8", /* Label alignment. */
4025 "16", /* Func alignment. */
4026 4, /* Small unroll limit. */
4027 2, /* Small unroll factor. */