arm: fix typo in dg-require-effective-target [PR118089]
[official-gcc.git] / gcc / config / i386 / x86-tune-costs.h
bloba4a128cd5dde671bd203e517f290366814ebf88b
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2025 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) scales a size of N bytes by 2, so under that assumption
   COSTS_N_BYTES (2) equals COSTS_N_INSNS (1): a 2-byte add costs one insn.  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop_algs entry that names libcall as its only algorithm.
   In this file it fills the second slot of the memcpy/memset tables of
   i386, i486, pentium, pentiumpro and geode.
   NOTE(review): presumably the second slot is the 64-bit variant, which
   these CPUs never use -- confirm against the users of stringop_algs.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
/* memcpy strategy table referenced by ix86_size_cost.  Both slots name
   rep_prefix_1_byte as the only algorithm.
   NOTE(review): the -1 size presumably means "no upper bound" -- confirm
   against the stringop_algs declaration.  */
30 static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost.  Both slots name
   rep_prefix_1_byte as the only algorithm.
   NOTE(review): the -1 size presumably means "no upper bound" -- confirm
   against the stringop_algs declaration.  */
33 static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* Cost table used when tuning for size.  This is a positional initializer:
   the order of the entries must match struct processor_costs, which is
   declared outside this file.  Instruction costs in the second part are
   given with COSTS_N_BYTES rather than COSTS_N_INSNS; cache sizes are 0 and
   all alignments are NULL.  String operations use ix86_size_memcpy and
   ix86_size_memset.  */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 2, /* cost for loading QImode using movzbl */
42 {2, 2, 2}, /* cost of loading integer registers
43 in QImode, HImode and SImode.
44 Relative to reg-reg move (2). */
45 {2, 2, 2}, /* cost of storing integer registers */
46 2, /* cost of reg,reg fld/fst */
47 {2, 2, 2}, /* cost of loading fp registers
48 in SFmode, DFmode and XFmode */
49 {2, 2, 2}, /* cost of storing fp registers
50 in SFmode, DFmode and XFmode */
51 3, /* cost of moving MMX register */
52 {3, 3}, /* cost of loading MMX registers
53 in SImode and DImode */
54 {3, 3}, /* cost of storing MMX registers
55 in SImode and DImode */
56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
58 in 32,64,128,256 and 512-bit */
59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
60 in 32,64,128,256 and 512-bit */
61 3, 3, /* SSE->integer and integer->SSE moves */
62 3, 3, /* mask->integer and integer->mask moves */
63 {2, 2, 2}, /* cost of loading mask register
64 in QImode, HImode, SImode. */
65 {2, 2, 2}, /* cost of storing mask register
66 in QImode, HImode, SImode. */
67 2, /* cost of moving mask register. */
68 /* End of register allocator costs. */
71 COSTS_N_BYTES (2), /* cost of an add instruction */
72 COSTS_N_BYTES (3), /* cost of a lea instruction */
73 COSTS_N_BYTES (2), /* variable shift costs */
74 COSTS_N_BYTES (3), /* constant shift costs */
75 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 0, /* cost of multiply per each bit set */
81 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
82 COSTS_N_BYTES (3), /* HI */
83 COSTS_N_BYTES (3), /* SI */
84 COSTS_N_BYTES (3), /* DI */
85 COSTS_N_BYTES (5)}, /* other */
86 COSTS_N_BYTES (3), /* cost of movsx */
87 COSTS_N_BYTES (3), /* cost of movzx */
88 0, /* "large" insn */
89 2, /* MOVE_RATIO */
90 2, /* CLEAR_RATIO */
91 {2, 2, 2}, /* cost of loading integer registers
92 in QImode, HImode and SImode.
93 Relative to reg-reg move (2). */
94 {2, 2, 2}, /* cost of storing integer registers */
95 {3, 3, 3, 3, 3}, /* cost of loading SSE register
96 in 32bit, 64bit, 128bit, 256bit and 512bit */
97 {3, 3, 3, 3, 3}, /* cost of storing SSE register
98 in 32bit, 64bit, 128bit, 256bit and 512bit */
99 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load
100 in 32bit, 64bit, 128bit, 256bit and 512bit
(presumably; five entries, as in the aligned tables above) */
101 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
102 in 32bit, 64bit, 128bit, 256bit and 512bit
(presumably; five entries, as in the aligned tables above) */
103 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
104 3, /* cost of moving SSE register to integer. */
105 5, 0, /* Gather load static, per_elt. */
106 5, 0, /* Gather store static, per_elt. */
107 0, /* size of l1 cache */
108 0, /* size of l2 cache */
109 0, /* size of prefetch block */
110 0, /* number of parallel prefetches */
111 2, /* Branch cost */
112 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
113 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
114 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
115 COSTS_N_BYTES (2), /* cost of FABS instruction. */
116 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
117 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
119 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
120 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
121 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
122 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
123 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
124 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
125 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
126 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
127 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
128 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
129 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
130 ix86_size_memcpy,
131 ix86_size_memset,
132 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
133 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
134 NULL, /* Loop alignment. */
135 NULL, /* Jump alignment. */
136 NULL, /* Label alignment. */
137 NULL, /* Func alignment. */
138 4, /* Small unroll limit. */
139 2, /* Small unroll factor. */
140 COSTS_N_INSNS (2), /* Branch mispredict scale. */
143 /* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost.  The first slot names
   rep_prefix_1_byte as the only algorithm; the second slot is the
   DUMMY_STRINGOP_ALGS placeholder.  */
144 static stringop_algs i386_memcpy[2] = {
145 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
146 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost.  The first slot names
   rep_prefix_1_byte as the only algorithm; the second slot is the
   DUMMY_STRINGOP_ALGS placeholder.  */
147 static stringop_algs i386_memset[2] = {
148 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
149 DUMMY_STRINGOP_ALGS};
/* Cost table for the 386.  This is a positional initializer: the order of
   the entries must match struct processor_costs, which is declared outside
   this file.  Instruction costs are given with COSTS_N_INSNS.  String
   operations use i386_memcpy and i386_memset.  */
151 static const
152 struct processor_costs i386_cost = { /* 386 specific costs */
154 /* Start of register allocator costs. integer->integer move cost is 2. */
155 4, /* cost for loading QImode using movzbl */
156 {2, 4, 2}, /* cost of loading integer registers
157 in QImode, HImode and SImode.
158 Relative to reg-reg move (2). */
159 {2, 4, 2}, /* cost of storing integer registers */
160 2, /* cost of reg,reg fld/fst */
161 {8, 8, 8}, /* cost of loading fp registers
162 in SFmode, DFmode and XFmode */
163 {8, 8, 8}, /* cost of storing fp registers
164 in SFmode, DFmode and XFmode */
165 2, /* cost of moving MMX register */
166 {4, 8}, /* cost of loading MMX registers
167 in SImode and DImode */
168 {4, 8}, /* cost of storing MMX registers
169 in SImode and DImode */
170 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
171 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
172 in 32,64,128,256 and 512-bit */
173 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
174 in 32,64,128,256 and 512-bit */
175 3, 3, /* SSE->integer and integer->SSE moves */
176 3, 3, /* mask->integer and integer->mask moves */
177 {2, 4, 2}, /* cost of loading mask register
178 in QImode, HImode, SImode. */
179 {2, 4, 2}, /* cost of storing mask register
180 in QImode, HImode, SImode. */
181 2, /* cost of moving mask register. */
182 /* End of register allocator costs. */
185 COSTS_N_INSNS (1), /* cost of an add instruction */
186 COSTS_N_INSNS (1), /* cost of a lea instruction */
187 COSTS_N_INSNS (3), /* variable shift costs */
188 COSTS_N_INSNS (2), /* constant shift costs */
189 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
190 COSTS_N_INSNS (6), /* HI */
191 COSTS_N_INSNS (6), /* SI */
192 COSTS_N_INSNS (6), /* DI */
193 COSTS_N_INSNS (6)}, /* other */
194 COSTS_N_INSNS (1), /* cost of multiply per each bit set.
NOTE(review): wrapped in COSTS_N_INSNS here, while the other cost
tables in this file give this entry as a plain integer -- confirm
that this is intended. */
195 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
196 COSTS_N_INSNS (23), /* HI */
197 COSTS_N_INSNS (23), /* SI */
198 COSTS_N_INSNS (23), /* DI */
199 COSTS_N_INSNS (23)}, /* other */
200 COSTS_N_INSNS (3), /* cost of movsx */
201 COSTS_N_INSNS (2), /* cost of movzx */
202 15, /* "large" insn */
203 3, /* MOVE_RATIO */
204 3, /* CLEAR_RATIO */
205 {2, 4, 2}, /* cost of loading integer registers
206 in QImode, HImode and SImode.
207 Relative to reg-reg move (2). */
208 {2, 4, 2}, /* cost of storing integer registers */
209 {4, 8, 16, 32, 64}, /* cost of loading SSE register
210 in 32bit, 64bit, 128bit, 256bit and 512bit */
211 {4, 8, 16, 32, 64}, /* cost of storing SSE register
212 in 32bit, 64bit, 128bit, 256bit and 512bit */
213 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
214 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
215 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
216 3, /* cost of moving SSE register to integer. */
217 4, 4, /* Gather load static, per_elt. */
218 4, 4, /* Gather store static, per_elt. */
219 0, /* size of l1 cache */
220 0, /* size of l2 cache */
221 0, /* size of prefetch block */
222 0, /* number of parallel prefetches */
223 1, /* Branch cost */
224 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
225 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
226 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
227 COSTS_N_INSNS (22), /* cost of FABS instruction. */
228 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
229 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
231 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
232 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
233 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
234 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
235 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
236 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
237 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
238 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
239 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
240 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
241 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
242 i386_memcpy,
243 i386_memset,
244 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
245 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
246 "4", /* Loop alignment. */
247 "4", /* Jump alignment. */
248 NULL, /* Label alignment. */
249 "4", /* Func alignment. */
250 4, /* Small unroll limit. */
251 2, /* Small unroll factor. */
252 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* memcpy strategy table referenced by i486_cost.  The first slot names
   rep_prefix_4_byte as the only algorithm; the second slot is the
   DUMMY_STRINGOP_ALGS placeholder.  */
255 static stringop_algs i486_memcpy[2] = {
256 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
257 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost.  The first slot names
   rep_prefix_4_byte as the only algorithm; the second slot is the
   DUMMY_STRINGOP_ALGS placeholder.  */
258 static stringop_algs i486_memset[2] = {
259 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
260 DUMMY_STRINGOP_ALGS};
/* Cost table for the 486.  This is a positional initializer: the order of
   the entries must match struct processor_costs, which is declared outside
   this file.  Instruction costs are given with COSTS_N_INSNS.  String
   operations use i486_memcpy and i486_memset.  */
262 static const
263 struct processor_costs i486_cost = { /* 486 specific costs */
265 /* Start of register allocator costs. integer->integer move cost is 2. */
266 4, /* cost for loading QImode using movzbl */
267 {2, 4, 2}, /* cost of loading integer registers
268 in QImode, HImode and SImode.
269 Relative to reg-reg move (2). */
270 {2, 4, 2}, /* cost of storing integer registers */
271 2, /* cost of reg,reg fld/fst */
272 {8, 8, 8}, /* cost of loading fp registers
273 in SFmode, DFmode and XFmode */
274 {8, 8, 8}, /* cost of storing fp registers
275 in SFmode, DFmode and XFmode */
276 2, /* cost of moving MMX register */
277 {4, 8}, /* cost of loading MMX registers
278 in SImode and DImode */
279 {4, 8}, /* cost of storing MMX registers
280 in SImode and DImode */
281 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
282 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
283 in 32,64,128,256 and 512-bit */
284 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
285 in 32,64,128,256 and 512-bit */
286 3, 3, /* SSE->integer and integer->SSE moves */
287 3, 3, /* mask->integer and integer->mask moves */
288 {2, 4, 2}, /* cost of loading mask register
289 in QImode, HImode, SImode. */
290 {2, 4, 2}, /* cost of storing mask register
291 in QImode, HImode, SImode. */
292 2, /* cost of moving mask register. */
293 /* End of register allocator costs. */
296 COSTS_N_INSNS (1), /* cost of an add instruction */
297 COSTS_N_INSNS (1), /* cost of a lea instruction */
298 COSTS_N_INSNS (3), /* variable shift costs */
299 COSTS_N_INSNS (2), /* constant shift costs */
300 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
301 COSTS_N_INSNS (12), /* HI */
302 COSTS_N_INSNS (12), /* SI */
303 COSTS_N_INSNS (12), /* DI */
304 COSTS_N_INSNS (12)}, /* other */
305 1, /* cost of multiply per each bit set */
306 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
307 COSTS_N_INSNS (40), /* HI */
308 COSTS_N_INSNS (40), /* SI */
309 COSTS_N_INSNS (40), /* DI */
310 COSTS_N_INSNS (40)}, /* other */
311 COSTS_N_INSNS (3), /* cost of movsx */
312 COSTS_N_INSNS (2), /* cost of movzx */
313 15, /* "large" insn */
314 3, /* MOVE_RATIO */
315 3, /* CLEAR_RATIO */
316 {2, 4, 2}, /* cost of loading integer registers
317 in QImode, HImode and SImode.
318 Relative to reg-reg move (2). */
319 {2, 4, 2}, /* cost of storing integer registers */
320 {4, 8, 16, 32, 64}, /* cost of loading SSE register
321 in 32bit, 64bit, 128bit, 256bit and 512bit */
322 {4, 8, 16, 32, 64}, /* cost of storing SSE register
323 in 32bit, 64bit, 128bit, 256bit and 512bit */
324 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
325 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
326 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
327 3, /* cost of moving SSE register to integer. */
328 4, 4, /* Gather load static, per_elt. */
329 4, 4, /* Gather store static, per_elt. */
330 4, /* size of l1 cache. 486 has 8kB cache
331 shared for code and data, so 4kB is
332 not really precise. */
333 4, /* size of l2 cache */
334 0, /* size of prefetch block */
335 0, /* number of parallel prefetches */
336 1, /* Branch cost */
337 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
338 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
339 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
340 COSTS_N_INSNS (3), /* cost of FABS instruction. */
341 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
342 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
344 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
345 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
346 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
347 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
348 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
349 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
350 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
351 COSTS_N_INSNS (74), /* cost of DIVSD instruction.
NOTE(review): 74 here, while DIVSS and FDIV in this table are 73
-- confirm that the difference is intended. */
352 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
353 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
354 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
355 i486_memcpy,
356 i486_memset,
357 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
358 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
359 "16", /* Loop alignment. */
360 "16", /* Jump alignment. */
361 "0:0:8", /* Label alignment. */
362 "16", /* Func alignment. */
363 4, /* Small unroll limit. */
364 2, /* Small unroll factor. */
365 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* memcpy strategy table referenced by pentium_cost and lakemont_cost.
   The first slot lists rep_prefix_4_byte with size 256 followed by libcall
   with size -1; the second slot is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably each size is an upper bound in bytes and -1
   means "any larger size" -- confirm against the stringop_algs
   declaration.  */
368 static stringop_algs pentium_memcpy[2] = {
369 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
370 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium_cost and lakemont_cost.
   The first slot lists rep_prefix_4_byte with size -1 (its leading
   algorithm field is libcall); the second slot is the DUMMY_STRINGOP_ALGS
   placeholder.  */
371 static stringop_algs pentium_memset[2] = {
372 {libcall, {{-1, rep_prefix_4_byte, false}}},
373 DUMMY_STRINGOP_ALGS};
/* Cost table for the Pentium.  This is a positional initializer: the order
   of the entries must match struct processor_costs, which is declared
   outside this file.  Instruction costs are given with COSTS_N_INSNS.
   String operations use pentium_memcpy and pentium_memset.  */
375 static const
376 struct processor_costs pentium_cost = {
378 /* Start of register allocator costs. integer->integer move cost is 2. */
379 6, /* cost for loading QImode using movzbl */
380 {2, 4, 2}, /* cost of loading integer registers
381 in QImode, HImode and SImode.
382 Relative to reg-reg move (2). */
383 {2, 4, 2}, /* cost of storing integer registers */
384 2, /* cost of reg,reg fld/fst */
385 {2, 2, 6}, /* cost of loading fp registers
386 in SFmode, DFmode and XFmode */
387 {4, 4, 6}, /* cost of storing fp registers
388 in SFmode, DFmode and XFmode */
389 8, /* cost of moving MMX register */
390 {8, 8}, /* cost of loading MMX registers
391 in SImode and DImode */
392 {8, 8}, /* cost of storing MMX registers
393 in SImode and DImode */
394 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
395 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
396 in 32,64,128,256 and 512-bit */
397 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
398 in 32,64,128,256 and 512-bit */
399 3, 3, /* SSE->integer and integer->SSE moves */
400 3, 3, /* mask->integer and integer->mask moves */
401 {2, 4, 2}, /* cost of loading mask register
402 in QImode, HImode, SImode. */
403 {2, 4, 2}, /* cost of storing mask register
404 in QImode, HImode, SImode. */
405 2, /* cost of moving mask register. */
406 /* End of register allocator costs. */
409 COSTS_N_INSNS (1), /* cost of an add instruction */
410 COSTS_N_INSNS (1), /* cost of a lea instruction */
411 COSTS_N_INSNS (4), /* variable shift costs */
412 COSTS_N_INSNS (1), /* constant shift costs */
413 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
414 COSTS_N_INSNS (11), /* HI */
415 COSTS_N_INSNS (11), /* SI */
416 COSTS_N_INSNS (11), /* DI */
417 COSTS_N_INSNS (11)}, /* other */
418 0, /* cost of multiply per each bit set */
419 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
420 COSTS_N_INSNS (25), /* HI */
421 COSTS_N_INSNS (25), /* SI */
422 COSTS_N_INSNS (25), /* DI */
423 COSTS_N_INSNS (25)}, /* other */
424 COSTS_N_INSNS (3), /* cost of movsx */
425 COSTS_N_INSNS (2), /* cost of movzx */
426 8, /* "large" insn */
427 6, /* MOVE_RATIO */
428 6, /* CLEAR_RATIO */
429 {2, 4, 2}, /* cost of loading integer registers
430 in QImode, HImode and SImode.
431 Relative to reg-reg move (2). */
432 {2, 4, 2}, /* cost of storing integer registers */
433 {4, 8, 16, 32, 64}, /* cost of loading SSE register
434 in 32bit, 64bit, 128bit, 256bit and 512bit */
435 {4, 8, 16, 32, 64}, /* cost of storing SSE register
436 in 32bit, 64bit, 128bit, 256bit and 512bit */
437 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
438 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
439 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
440 3, /* cost of moving SSE register to integer. */
441 4, 4, /* Gather load static, per_elt. */
442 4, 4, /* Gather store static, per_elt. */
443 8, /* size of l1 cache. */
444 8, /* size of l2 cache */
445 0, /* size of prefetch block */
446 0, /* number of parallel prefetches */
447 2, /* Branch cost */
448 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
449 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
450 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
451 COSTS_N_INSNS (1), /* cost of FABS instruction. */
452 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
453 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
455 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
456 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
457 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
458 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
459 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
460 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
461 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
462 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
463 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
464 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
465 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
466 pentium_memcpy,
467 pentium_memset,
468 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
469 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
470 "16:8:8", /* Loop alignment. */
471 "16:8:8", /* Jump alignment. */
472 "0:0:8", /* Label alignment. */
473 "16", /* Func alignment. */
474 4, /* Small unroll limit. */
475 2, /* Small unroll factor. */
476 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* Cost table lakemont_cost.  This is a positional initializer: the order
   of the entries must match struct processor_costs, which is declared
   outside this file.  Instruction costs are given with COSTS_N_INSNS.
   It has no string-operation tables of its own and reuses pentium_memcpy
   and pentium_memset.  */
479 static const
480 struct processor_costs lakemont_cost = {
482 /* Start of register allocator costs. integer->integer move cost is 2. */
483 6, /* cost for loading QImode using movzbl */
484 {2, 4, 2}, /* cost of loading integer registers
485 in QImode, HImode and SImode.
486 Relative to reg-reg move (2). */
487 {2, 4, 2}, /* cost of storing integer registers */
488 2, /* cost of reg,reg fld/fst */
489 {2, 2, 6}, /* cost of loading fp registers
490 in SFmode, DFmode and XFmode */
491 {4, 4, 6}, /* cost of storing fp registers
492 in SFmode, DFmode and XFmode */
493 8, /* cost of moving MMX register */
494 {8, 8}, /* cost of loading MMX registers
495 in SImode and DImode */
496 {8, 8}, /* cost of storing MMX registers
497 in SImode and DImode */
498 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
499 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
500 in 32,64,128,256 and 512-bit */
501 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
502 in 32,64,128,256 and 512-bit */
503 3, 3, /* SSE->integer and integer->SSE moves */
504 3, 3, /* mask->integer and integer->mask moves */
505 {2, 4, 2}, /* cost of loading mask register
506 in QImode, HImode, SImode. */
507 {2, 4, 2}, /* cost of storing mask register
508 in QImode, HImode, SImode. */
509 2, /* cost of moving mask register. */
510 /* End of register allocator costs. */
513 COSTS_N_INSNS (1), /* cost of an add instruction */
514 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; the + 1 makes
it slightly more expensive than the add above */
515 COSTS_N_INSNS (1), /* variable shift costs */
516 COSTS_N_INSNS (1), /* constant shift costs */
517 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
518 COSTS_N_INSNS (11), /* HI */
519 COSTS_N_INSNS (11), /* SI */
520 COSTS_N_INSNS (11), /* DI */
521 COSTS_N_INSNS (11)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
524 COSTS_N_INSNS (25), /* HI */
525 COSTS_N_INSNS (25), /* SI */
526 COSTS_N_INSNS (25), /* DI */
527 COSTS_N_INSNS (25)}, /* other */
528 COSTS_N_INSNS (3), /* cost of movsx */
529 COSTS_N_INSNS (2), /* cost of movzx */
530 8, /* "large" insn */
531 17, /* MOVE_RATIO */
532 6, /* CLEAR_RATIO */
533 {2, 4, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 4, 2}, /* cost of storing integer registers */
537 {4, 8, 16, 32, 64}, /* cost of loading SSE register
538 in 32bit, 64bit, 128bit, 256bit and 512bit */
539 {4, 8, 16, 32, 64}, /* cost of storing SSE register
540 in 32bit, 64bit, 128bit, 256bit and 512bit */
541 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
542 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
543 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
544 3, /* cost of moving SSE register to integer. */
545 4, 4, /* Gather load static, per_elt. */
546 4, 4, /* Gather store static, per_elt. */
547 8, /* size of l1 cache. */
548 8, /* size of l2 cache */
549 0, /* size of prefetch block */
550 0, /* number of parallel prefetches */
551 2, /* Branch cost */
552 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
553 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
554 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
555 COSTS_N_INSNS (1), /* cost of FABS instruction. */
556 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
557 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
559 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
560 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
561 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
562 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
563 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
564 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
565 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
566 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
567 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
568 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
569 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
570 pentium_memcpy,
571 pentium_memset,
572 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
573 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
574 "16:8:8", /* Loop alignment. */
575 "16:8:8", /* Jump alignment. */
576 "0:0:8", /* Label alignment. */
577 "16", /* Func alignment. */
578 4, /* Small unroll limit. */
579 2, /* Small unroll factor. */
580 COSTS_N_INSNS (2), /* Branch mispredict scale. */
583 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
584 (we ensure the alignment). For small blocks an inline loop is still a
585 noticeable win; for bigger blocks either rep movsl or rep movsb is
586 the way to go. Rep movsb apparently has a more expensive startup time in
the CPU,
587 but after 4K the difference is down in the noise.
The first slot below therefore lists loop (size 128), unrolled_loop
(size 1024), rep_prefix_4_byte (size 8192) and rep_prefix_1_byte (size -1);
the second slot is the DUMMY_STRINGOP_ALGS placeholder. */
588 static stringop_algs pentiumpro_memcpy[2] = {
589 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
590 {8192, rep_prefix_4_byte, false},
591 {-1, rep_prefix_1_byte, false}}},
592 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  The first slot
   lists unrolled_loop (size 1024), rep_prefix_4_byte (size 8192) and
   libcall (size -1); the second slot is the DUMMY_STRINGOP_ALGS
   placeholder.
   NOTE(review): presumably each size is an upper bound in bytes and -1
   means "any larger size" -- confirm against the stringop_algs
   declaration.  */
593 static stringop_algs pentiumpro_memset[2] = {
594 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
595 {8192, rep_prefix_4_byte, false},
596 {-1, libcall, false}}},
597 DUMMY_STRINGOP_ALGS};
/* Cost table for the PentiumPro.  This is a positional initializer: the
   order of the entries must match struct processor_costs, which is declared
   outside this file.  Instruction costs are given with COSTS_N_INSNS.
   String operations use pentiumpro_memcpy and pentiumpro_memset.  */
598 static const
599 struct processor_costs pentiumpro_cost = {
601 /* Start of register allocator costs. integer->integer move cost is 2. */
602 2, /* cost for loading QImode using movzbl */
603 {4, 4, 4}, /* cost of loading integer registers
604 in QImode, HImode and SImode.
605 Relative to reg-reg move (2). */
606 {2, 2, 2}, /* cost of storing integer registers */
607 2, /* cost of reg,reg fld/fst */
608 {2, 2, 6}, /* cost of loading fp registers
609 in SFmode, DFmode and XFmode */
610 {4, 4, 6}, /* cost of storing fp registers
611 in SFmode, DFmode and XFmode */
612 2, /* cost of moving MMX register */
613 {2, 2}, /* cost of loading MMX registers
614 in SImode and DImode */
615 {2, 2}, /* cost of storing MMX registers
616 in SImode and DImode */
617 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
618 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
619 in 32,64,128,256 and 512-bit */
620 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
621 in 32,64,128,256 and 512-bit */
622 3, 3, /* SSE->integer and integer->SSE moves */
623 3, 3, /* mask->integer and integer->mask moves */
624 {4, 4, 4}, /* cost of loading mask register
625 in QImode, HImode, SImode. */
626 {2, 2, 2}, /* cost of storing mask register
627 in QImode, HImode, SImode. */
628 2, /* cost of moving mask register. */
629 /* End of register allocator costs. */
632 COSTS_N_INSNS (1), /* cost of an add instruction */
633 COSTS_N_INSNS (1), /* cost of a lea instruction */
634 COSTS_N_INSNS (1), /* variable shift costs */
635 COSTS_N_INSNS (1), /* constant shift costs */
636 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
637 COSTS_N_INSNS (4), /* HI */
638 COSTS_N_INSNS (4), /* SI */
639 COSTS_N_INSNS (4), /* DI */
640 COSTS_N_INSNS (4)}, /* other */
641 0, /* cost of multiply per each bit set */
642 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
643 COSTS_N_INSNS (17), /* HI */
644 COSTS_N_INSNS (17), /* SI */
645 COSTS_N_INSNS (17), /* DI */
646 COSTS_N_INSNS (17)}, /* other */
647 COSTS_N_INSNS (1), /* cost of movsx */
648 COSTS_N_INSNS (1), /* cost of movzx */
649 8, /* "large" insn */
650 6, /* MOVE_RATIO */
651 6, /* CLEAR_RATIO */
652 {4, 4, 4}, /* cost of loading integer registers
653 in QImode, HImode and SImode.
654 Relative to reg-reg move (2). */
655 {2, 2, 2}, /* cost of storing integer registers */
656 {4, 8, 16, 32, 64}, /* cost of loading SSE register
657 in 32bit, 64bit, 128bit, 256bit and 512bit */
658 {4, 8, 16, 32, 64}, /* cost of storing SSE register
659 in 32bit, 64bit, 128bit, 256bit and 512bit */
660 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
661 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
662 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
663 3, /* cost of moving SSE register to integer. */
664 4, 4, /* Gather load static, per_elt. */
665 4, 4, /* Gather store static, per_elt. */
666 8, /* size of l1 cache. */
667 256, /* size of l2 cache */
668 32, /* size of prefetch block */
669 6, /* number of parallel prefetches */
670 2, /* Branch cost */
671 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
672 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
673 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
674 COSTS_N_INSNS (2), /* cost of FABS instruction. */
675 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
676 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
678 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
679 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
680 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
681 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
682 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
683 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
684 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
685 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
686 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
687 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
688 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
689 pentiumpro_memcpy,
690 pentiumpro_memset,
691 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
692 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
693 "16", /* Loop alignment. */
694 "16:11:8", /* Jump alignment. */
695 "0:0:8", /* Label alignment. */
696 "16", /* Func alignment. */
697 4, /* Small unroll limit. */
698 2, /* Small unroll factor. */
699 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* memcpy strategy table for the geode cost table that follows.  The first
   slot lists rep_prefix_4_byte with size 256 followed by libcall with
   size -1; the second slot is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably each size is an upper bound in bytes and -1
   means "any larger size" -- confirm against the stringop_algs
   declaration.  */
702 static stringop_algs geode_memcpy[2] = {
703 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
704 DUMMY_STRINGOP_ALGS};
/* memset strategy table for the geode cost table that follows.  Its
   contents are identical to geode_memcpy: rep_prefix_4_byte with size 256,
   then libcall with size -1, and the DUMMY_STRINGOP_ALGS placeholder in the
   second slot.  */
705 static stringop_algs geode_memset[2] = {
706 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
707 DUMMY_STRINGOP_ALGS};
/* Per-CPU cost table for Geode.  This is a positional initializer: each
   value is labelled by the comment that follows it, so entries must stay
   in exactly this order.  Per the notes at the top of the file, costs are
   relative to an add and COSTS_N_INSNS (N) is assumed to be N * 4; the
   register allocator section is instead relative to an integer reg-reg
   move cost of 2.  */
708 static const
709 struct processor_costs geode_cost = {
711 /* Start of register allocator costs. integer->integer move cost is 2. */
712 2, /* cost for loading QImode using movzbl */
713 {2, 2, 2}, /* cost of loading integer registers
714 in QImode, HImode and SImode.
715 Relative to reg-reg move (2). */
716 {2, 2, 2}, /* cost of storing integer registers */
717 2, /* cost of reg,reg fld/fst */
718 {2, 2, 2}, /* cost of loading fp registers
719 in SFmode, DFmode and XFmode */
720 {4, 6, 6}, /* cost of storing fp registers
721 in SFmode, DFmode and XFmode */
722 2, /* cost of moving MMX register */
723 {2, 2}, /* cost of loading MMX registers
724 in SImode and DImode */
725 {2, 2}, /* cost of storing MMX registers
726 in SImode and DImode */
727 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
728 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
729 in 32,64,128,256 and 512-bit */
730 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
731 in 32,64,128,256 and 512-bit */
732 6, 6, /* SSE->integer and integer->SSE moves */
733 6, 6, /* mask->integer and integer->mask moves */
734 {2, 2, 2}, /* cost of loading mask register
735 in QImode, HImode, SImode. */
736 {2, 2, 2}, /* cost of storing mask register
737 in QImode, HImode, SImode. */
738 2, /* cost of moving mask register. */
739 /* End of register allocator costs. */
742 COSTS_N_INSNS (1), /* cost of an add instruction */
743 COSTS_N_INSNS (1), /* cost of a lea instruction */
744 COSTS_N_INSNS (2), /* variable shift costs */
745 COSTS_N_INSNS (1), /* constant shift costs */
746 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
747 COSTS_N_INSNS (4), /* HI */
748 COSTS_N_INSNS (7), /* SI */
749 COSTS_N_INSNS (7), /* DI */
750 COSTS_N_INSNS (7)}, /* other */
751 0, /* cost of multiply per each bit set */
752 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
753 COSTS_N_INSNS (23), /* HI */
754 COSTS_N_INSNS (39), /* SI */
755 COSTS_N_INSNS (39), /* DI */
756 COSTS_N_INSNS (39)}, /* other */
757 COSTS_N_INSNS (1), /* cost of movsx */
758 COSTS_N_INSNS (1), /* cost of movzx */
759 8, /* "large" insn */
760 4, /* MOVE_RATIO */
761 4, /* CLEAR_RATIO */
762 {2, 2, 2}, /* cost of loading integer registers
763 in QImode, HImode and SImode.
764 Relative to reg-reg move (2). */
765 {2, 2, 2}, /* cost of storing integer registers */
766 {2, 2, 8, 16, 32}, /* cost of loading SSE register
767 in 32bit, 64bit, 128bit, 256bit and 512bit */
768 {2, 2, 8, 16, 32}, /* cost of storing SSE register
769 in 32bit, 64bit, 128bit, 256bit and 512bit */
770 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
771 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
772 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
773 6, /* cost of moving SSE register to integer. */
774 2, 2, /* Gather load static, per_elt. */
775 2, 2, /* Gather store static, per_elt. */
776 64, /* size of l1 cache. */
777 128, /* size of l2 cache. */
778 32, /* size of prefetch block */
779 1, /* number of parallel prefetches */
780 1, /* Branch cost */
781 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
782 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
783 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
784 COSTS_N_INSNS (1), /* cost of FABS instruction. */
785 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
786 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
788 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
789 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
790 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
791 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
792 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
793 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
794 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
795 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
796 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
797 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
798 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
799 geode_memcpy, /* stringop_algs table for memcpy. */
800 geode_memset, /* stringop_algs table for memset. */
801 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
802 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
803 NULL, /* Loop alignment. */
804 NULL, /* Jump alignment. */
805 NULL, /* Label alignment. */
806 NULL, /* Func alignment. */
807 4, /* Small unroll limit. */
808 2, /* Small unroll factor. */
809 COSTS_N_INSNS (2), /* Branch mispredict scale. */
812 static stringop_algs k6_memcpy[2] = {
813 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
814 DUMMY_STRINGOP_ALGS};
815 static stringop_algs k6_memset[2] = {
816 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
817 DUMMY_STRINGOP_ALGS};
/* Per-CPU cost table for K6.  This is a positional initializer: each
   value is labelled by the comment that follows it, so entries must stay
   in exactly this order.  Per the notes at the top of the file, costs are
   relative to an add and COSTS_N_INSNS (N) is assumed to be N * 4; the
   register allocator section is instead relative to an integer reg-reg
   move cost of 2.  */
818 static const
819 struct processor_costs k6_cost = {
821 /* Start of register allocator costs. integer->integer move cost is 2. */
822 3, /* cost for loading QImode using movzbl */
823 {4, 5, 4}, /* cost of loading integer registers
824 in QImode, HImode and SImode.
825 Relative to reg-reg move (2). */
826 {2, 3, 2}, /* cost of storing integer registers */
827 4, /* cost of reg,reg fld/fst */
828 {6, 6, 6}, /* cost of loading fp registers
829 in SFmode, DFmode and XFmode */
830 {4, 4, 4}, /* cost of storing fp registers
831 in SFmode, DFmode and XFmode */
832 2, /* cost of moving MMX register */
833 {2, 2}, /* cost of loading MMX registers
834 in SImode and DImode */
835 {2, 2}, /* cost of storing MMX registers
836 in SImode and DImode */
837 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
838 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
839 in 32,64,128,256 and 512-bit */
840 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
841 in 32,64,128,256 and 512-bit */
842 6, 6, /* SSE->integer and integer->SSE moves */
843 6, 6, /* mask->integer and integer->mask moves */
844 {4, 5, 4}, /* cost of loading mask register
845 in QImode, HImode, SImode. */
846 {2, 3, 2}, /* cost of storing mask register
847 in QImode, HImode, SImode. */
848 2, /* cost of moving mask register. */
849 /* End of register allocator costs. */
852 COSTS_N_INSNS (1), /* cost of an add instruction */
853 COSTS_N_INSNS (2), /* cost of a lea instruction */
854 COSTS_N_INSNS (1), /* variable shift costs */
855 COSTS_N_INSNS (1), /* constant shift costs */
856 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
857 COSTS_N_INSNS (3), /* HI */
858 COSTS_N_INSNS (3), /* SI */
859 COSTS_N_INSNS (3), /* DI */
860 COSTS_N_INSNS (3)}, /* other */
861 0, /* cost of multiply per each bit set */
862 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
863 COSTS_N_INSNS (18), /* HI */
864 COSTS_N_INSNS (18), /* SI */
865 COSTS_N_INSNS (18), /* DI */
866 COSTS_N_INSNS (18)}, /* other */
867 COSTS_N_INSNS (2), /* cost of movsx */
868 COSTS_N_INSNS (2), /* cost of movzx */
869 8, /* "large" insn */
870 4, /* MOVE_RATIO */
871 4, /* CLEAR_RATIO */
872 {4, 5, 4}, /* cost of loading integer registers
873 in QImode, HImode and SImode.
874 Relative to reg-reg move (2). */
875 {2, 3, 2}, /* cost of storing integer registers */
876 {2, 2, 8, 16, 32}, /* cost of loading SSE register
877 in 32bit, 64bit, 128bit, 256bit and 512bit */
878 {2, 2, 8, 16, 32}, /* cost of storing SSE register
879 in 32bit, 64bit, 128bit, 256bit and 512bit */
880 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
881 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
882 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
883 6, /* cost of moving SSE register to integer. */
884 2, 2, /* Gather load static, per_elt. */
885 2, 2, /* Gather store static, per_elt. */
886 32, /* size of l1 cache. */
887 32, /* size of l2 cache. Some models
888 have integrated l2 cache, but
889 optimizing for k6 is not important
890 enough to worry about that. */
891 32, /* size of prefetch block */
892 1, /* number of parallel prefetches */
893 1, /* Branch cost */
894 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
895 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
896 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
897 COSTS_N_INSNS (2), /* cost of FABS instruction. */
898 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
899 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
901 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
902 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
903 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
904 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
905 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
906 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
907 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
908 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
909 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
910 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
911 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
912 k6_memcpy, /* stringop_algs table for memcpy. */
913 k6_memset, /* stringop_algs table for memset. */
914 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
915 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
916 "32:8:8", /* Loop alignment. */
917 "32:8:8", /* Jump alignment. */
918 "0:0:8", /* Label alignment. */
919 "32", /* Func alignment. */
920 4, /* Small unroll limit. */
921 2, /* Small unroll factor. */
922 COSTS_N_INSNS (2), /* Branch mispredict scale. */
925 /* For some reason, Athlon deals better with REP prefix (relative to loops)
926 compared to K8. Alignment becomes important after 8 bytes for memcpy and
927 128 bytes for memset. */
928 static stringop_algs athlon_memcpy[2] = {
929 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
930 DUMMY_STRINGOP_ALGS};
931 static stringop_algs athlon_memset[2] = {
932 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
933 DUMMY_STRINGOP_ALGS};
/* Per-CPU cost table for Athlon.  This is a positional initializer: each
   value is labelled by the comment that follows it, so entries must stay
   in exactly this order.  Per the notes at the top of the file, costs are
   relative to an add and COSTS_N_INSNS (N) is assumed to be N * 4; the
   register allocator section is instead relative to an integer reg-reg
   move cost of 2.  */
934 static const
935 struct processor_costs athlon_cost = {
937 /* Start of register allocator costs. integer->integer move cost is 2. */
938 4, /* cost for loading QImode using movzbl */
939 {3, 4, 3}, /* cost of loading integer registers
940 in QImode, HImode and SImode.
941 Relative to reg-reg move (2). */
942 {3, 4, 3}, /* cost of storing integer registers */
943 4, /* cost of reg,reg fld/fst */
944 {4, 4, 12}, /* cost of loading fp registers
945 in SFmode, DFmode and XFmode */
946 {6, 6, 8}, /* cost of storing fp registers
947 in SFmode, DFmode and XFmode */
948 2, /* cost of moving MMX register */
949 {4, 4}, /* cost of loading MMX registers
950 in SImode and DImode */
951 {4, 4}, /* cost of storing MMX registers
952 in SImode and DImode */
953 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
954 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
955 in 32,64,128,256 and 512-bit */
956 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
957 in 32,64,128,256 and 512-bit */
958 5, 5, /* SSE->integer and integer->SSE moves */
959 5, 5, /* mask->integer and integer->mask moves */
960 {3, 4, 3}, /* cost of loading mask register
961 in QImode, HImode, SImode. */
962 {3, 4, 3}, /* cost of storing mask register
963 in QImode, HImode, SImode. */
964 2, /* cost of moving mask register. */
965 /* End of register allocator costs. */
968 COSTS_N_INSNS (1), /* cost of an add instruction */
969 COSTS_N_INSNS (2), /* cost of a lea instruction */
970 COSTS_N_INSNS (1), /* variable shift costs */
971 COSTS_N_INSNS (1), /* constant shift costs */
972 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
973 COSTS_N_INSNS (5), /* HI */
974 COSTS_N_INSNS (5), /* SI */
975 COSTS_N_INSNS (5), /* DI */
976 COSTS_N_INSNS (5)}, /* other */
977 0, /* cost of multiply per each bit set */
978 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
979 COSTS_N_INSNS (26), /* HI */
980 COSTS_N_INSNS (42), /* SI */
981 COSTS_N_INSNS (74), /* DI */
982 COSTS_N_INSNS (74)}, /* other */
983 COSTS_N_INSNS (1), /* cost of movsx */
984 COSTS_N_INSNS (1), /* cost of movzx */
985 8, /* "large" insn */
986 9, /* MOVE_RATIO */
987 6, /* CLEAR_RATIO */
988 {3, 4, 3}, /* cost of loading integer registers
989 in QImode, HImode and SImode.
990 Relative to reg-reg move (2). */
991 {3, 4, 3}, /* cost of storing integer registers */
992 {4, 4, 12, 12, 24}, /* cost of loading SSE register
993 in 32bit, 64bit, 128bit, 256bit and 512bit */
994 {4, 4, 10, 10, 20}, /* cost of storing SSE register
995 in 32bit, 64bit, 128bit, 256bit and 512bit */
996 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
997 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
998 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
999 5, /* cost of moving SSE register to integer. */
1000 4, 4, /* Gather load static, per_elt. */
1001 4, 4, /* Gather store static, per_elt. */
1002 64, /* size of l1 cache. */
1003 256, /* size of l2 cache. */
1004 64, /* size of prefetch block */
1005 6, /* number of parallel prefetches */
1006 5, /* Branch cost */
1007 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1008 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1009 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1010 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1011 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1012 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1014 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1015 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1016 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1017 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1018 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1019 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1020 /* 11-16 (NOTE(review): presumably the latency range behind the DIVSS
   entry below -- confirm).  */
1021 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1022 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
1023 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1024 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
1025 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1026 athlon_memcpy, /* stringop_algs table for memcpy. */
1027 athlon_memset, /* stringop_algs table for memset. */
1028 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1029 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1030 "16:8:8", /* Loop alignment. */
1031 "16:8:8", /* Jump alignment. */
1032 "0:0:8", /* Label alignment. */
1033 "16", /* Func alignment. */
1034 4, /* Small unroll limit. */
1035 2, /* Small unroll factor. */
1036 COSTS_N_INSNS (2), /* Branch mispredict scale. */
1039 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1040 small blocks it is better to use a loop. For large blocks, a libcall can
1041 do non-temporal accesses and beat inline code considerably. */
1042 static stringop_algs k8_memcpy[2] = {
1043 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1044 {-1, rep_prefix_4_byte, false}}},
1045 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1046 {-1, libcall, false}}}};
1047 static stringop_algs k8_memset[2] = {
1048 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1049 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1050 {libcall, {{48, unrolled_loop, false},
1051 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Per-CPU cost table for K8.  This is a positional initializer: each
   value is labelled by the comment that follows it, so entries must stay
   in exactly this order.  Per the notes at the top of the file, costs are
   relative to an add and COSTS_N_INSNS (N) is assumed to be N * 4; the
   register allocator section is instead relative to an integer reg-reg
   move cost of 2.  */
1052 static const
1053 struct processor_costs k8_cost = {
1055 /* Start of register allocator costs. integer->integer move cost is 2. */
1056 4, /* cost for loading QImode using movzbl */
1057 {3, 4, 3}, /* cost of loading integer registers
1058 in QImode, HImode and SImode.
1059 Relative to reg-reg move (2). */
1060 {3, 4, 3}, /* cost of storing integer registers */
1061 4, /* cost of reg,reg fld/fst */
1062 {4, 4, 12}, /* cost of loading fp registers
1063 in SFmode, DFmode and XFmode */
1064 {6, 6, 8}, /* cost of storing fp registers
1065 in SFmode, DFmode and XFmode */
1066 2, /* cost of moving MMX register */
1067 {3, 3}, /* cost of loading MMX registers
1068 in SImode and DImode */
1069 {4, 4}, /* cost of storing MMX registers
1070 in SImode and DImode */
1071 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1072 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
1073 in 32,64,128,256 and 512-bit */
1074 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
1075 in 32,64,128,256 and 512-bit */
1076 5, 5, /* SSE->integer and integer->SSE moves */
1077 5, 5, /* mask->integer and integer->mask moves */
1078 {3, 4, 3}, /* cost of loading mask register
1079 in QImode, HImode, SImode. */
1080 {3, 4, 3}, /* cost of storing mask register
1081 in QImode, HImode, SImode. */
1082 2, /* cost of moving mask register. */
1083 /* End of register allocator costs. */
1086 COSTS_N_INSNS (1), /* cost of an add instruction */
1087 COSTS_N_INSNS (2), /* cost of a lea instruction */
1088 COSTS_N_INSNS (1), /* variable shift costs */
1089 COSTS_N_INSNS (1), /* constant shift costs */
1090 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1091 COSTS_N_INSNS (4), /* HI */
1092 COSTS_N_INSNS (3), /* SI */
1093 COSTS_N_INSNS (4), /* DI */
1094 COSTS_N_INSNS (5)}, /* other */
1095 0, /* cost of multiply per each bit set */
1096 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1097 COSTS_N_INSNS (26), /* HI */
1098 COSTS_N_INSNS (42), /* SI */
1099 COSTS_N_INSNS (74), /* DI */
1100 COSTS_N_INSNS (74)}, /* other */
1101 COSTS_N_INSNS (1), /* cost of movsx */
1102 COSTS_N_INSNS (1), /* cost of movzx */
1103 8, /* "large" insn */
1104 9, /* MOVE_RATIO */
1105 6, /* CLEAR_RATIO */
1106 {3, 4, 3}, /* cost of loading integer registers
1107 in QImode, HImode and SImode.
1108 Relative to reg-reg move (2). */
1109 {3, 4, 3}, /* cost of storing integer registers */
1110 {4, 3, 12, 12, 24}, /* cost of loading SSE register
1111 in 32bit, 64bit, 128bit, 256bit and 512bit */
1112 {4, 4, 10, 10, 20}, /* cost of storing SSE register
1113 in 32bit, 64bit, 128bit, 256bit and 512bit */
1114 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
1115 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
1116 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1117 5, /* cost of moving SSE register to integer. */
1118 4, 4, /* Gather load static, per_elt. */
1119 4, 4, /* Gather store static, per_elt. */
1120 64, /* size of l1 cache. */
1121 512, /* size of l2 cache. */
1122 64, /* size of prefetch block */
1123 /* New AMD processors never drop prefetches; if they cannot be performed
1124 immediately, they are queued. We set the number of simultaneous prefetches
1125 to a large constant to reflect this (leaving the number of prefetches
1126 completely unlimited is probably not a good idea, as their execution also
1127 takes some time). */
1128 100, /* number of parallel prefetches */
1129 3, /* Branch cost */
1130 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1131 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1132 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1133 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1134 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1135 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1137 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1138 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1139 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1140 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1141 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1142 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1143 /* 11-16 (NOTE(review): presumably the latency range behind the DIVSS
   entry below -- confirm).  */
1144 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1145 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1146 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1147 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1148 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1149 k8_memcpy, /* stringop_algs table for memcpy. */
1150 k8_memset, /* stringop_algs table for memset. */
1151 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1152 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1153 "16:8:8", /* Loop alignment. */
1154 "16:8:8", /* Jump alignment. */
1155 "0:0:8", /* Label alignment. */
1156 "16", /* Func alignment. */
1157 4, /* Small unroll limit. */
1158 2, /* Small unroll factor. */
1159 COSTS_N_INSNS (2), /* Branch mispredict scale. */
1162 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1163 very small blocks it is better to use a loop. For large blocks, a libcall can
1164 do non-temporal accesses and beat inline code considerably. */
1165 static stringop_algs amdfam10_memcpy[2] = {
1166 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1167 {-1, rep_prefix_4_byte, false}}},
1168 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1169 {-1, libcall, false}}}};
1170 static stringop_algs amdfam10_memset[2] = {
1171 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1172 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1173 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1174 {-1, libcall, false}}}};
/* Per-CPU cost table for AMDFAM10.  This is a positional initializer:
   each value is labelled by the comment that follows it, so entries must
   stay in exactly this order.  Per the notes at the top of the file,
   costs are relative to an add and COSTS_N_INSNS (N) is assumed to be
   N * 4; the register allocator section is instead relative to an
   integer reg-reg move cost of 2.
   NOTE(review): unlike the geode, k6, athlon and k8 tables, this one is
   declared without "static const" -- confirm that is intentional.  */
1175 struct processor_costs amdfam10_cost = {
1177 /* Start of register allocator costs. integer->integer move cost is 2. */
1178 4, /* cost for loading QImode using movzbl */
1179 {3, 4, 3}, /* cost of loading integer registers
1180 in QImode, HImode and SImode.
1181 Relative to reg-reg move (2). */
1182 {3, 4, 3}, /* cost of storing integer registers */
1183 4, /* cost of reg,reg fld/fst */
1184 {4, 4, 12}, /* cost of loading fp registers
1185 in SFmode, DFmode and XFmode */
1186 {6, 6, 8}, /* cost of storing fp registers
1187 in SFmode, DFmode and XFmode */
1188 2, /* cost of moving MMX register */
1189 {3, 3}, /* cost of loading MMX registers
1190 in SImode and DImode */
1191 {4, 4}, /* cost of storing MMX registers
1192 in SImode and DImode */
1193 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1194 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1195 in 32,64,128,256 and 512-bit */
1196 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1197 in 32,64,128,256 and 512-bit */
1198 3, 3, /* SSE->integer and integer->SSE moves */
1199 3, 3, /* mask->integer and integer->mask moves */
1200 {3, 4, 3}, /* cost of loading mask register
1201 in QImode, HImode, SImode. */
1202 {3, 4, 3}, /* cost of storing mask register
1203 in QImode, HImode, SImode. */
1204 2, /* cost of moving mask register. */
/* NOTE(review): the MOVD figures below presumably justify the
   SSE<->integer move cost of 3 used above (4 on K8 versus 3 on
   AMDFAM10) -- confirm.  */
1206 /* On K8:
1207 MOVD reg64, xmmreg Double FSTORE 4
1208 MOVD reg32, xmmreg Double FSTORE 4
1209 On AMDFAM10:
1210 MOVD reg64, xmmreg Double FADD 3
1211 1/1 1/1
1212 MOVD reg32, xmmreg Double FADD 3
1213 1/1 1/1 */
1214 /* End of register allocator costs. */
1217 COSTS_N_INSNS (1), /* cost of an add instruction */
1218 COSTS_N_INSNS (2), /* cost of a lea instruction */
1219 COSTS_N_INSNS (1), /* variable shift costs */
1220 COSTS_N_INSNS (1), /* constant shift costs */
1221 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1222 COSTS_N_INSNS (4), /* HI */
1223 COSTS_N_INSNS (3), /* SI */
1224 COSTS_N_INSNS (4), /* DI */
1225 COSTS_N_INSNS (5)}, /* other */
1226 0, /* cost of multiply per each bit set */
1227 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1228 COSTS_N_INSNS (35), /* HI */
1229 COSTS_N_INSNS (51), /* SI */
1230 COSTS_N_INSNS (83), /* DI */
1231 COSTS_N_INSNS (83)}, /* other */
1232 COSTS_N_INSNS (1), /* cost of movsx */
1233 COSTS_N_INSNS (1), /* cost of movzx */
1234 8, /* "large" insn */
1235 9, /* MOVE_RATIO */
1236 6, /* CLEAR_RATIO */
1237 {3, 4, 3}, /* cost of loading integer registers
1238 in QImode, HImode and SImode.
1239 Relative to reg-reg move (2). */
1240 {3, 4, 3}, /* cost of storing integer registers */
1241 {4, 4, 3, 6, 12}, /* cost of loading SSE register
1242 in 32bit, 64bit, 128bit, 256bit and 512bit */
1243 {4, 4, 5, 10, 20}, /* cost of storing SSE register
1244 in 32bit, 64bit, 128bit, 256bit and 512bit */
1245 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
1246 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1247 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1248 3, /* cost of moving SSE register to integer. */
1249 4, 4, /* Gather load static, per_elt. */
1250 4, 4, /* Gather store static, per_elt. */
1251 64, /* size of l1 cache. */
1252 512, /* size of l2 cache. */
1253 64, /* size of prefetch block */
1254 /* New AMD processors never drop prefetches; if they cannot be performed
1255 immediately, they are queued. We set the number of simultaneous prefetches
1256 to a large constant to reflect this (leaving the number of prefetches
1257 completely unlimited is probably not a good idea, as their execution also
1258 takes some time). */
1259 100, /* number of parallel prefetches */
1260 2, /* Branch cost */
1261 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1262 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1263 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1264 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1265 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1266 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1268 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1269 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1270 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1271 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1272 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1273 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1274 /* 11-16 (NOTE(review): presumably the latency range behind the DIVSS
   entry below -- confirm).  */
1275 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1276 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1277 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1278 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1279 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1280 amdfam10_memcpy, /* stringop_algs table for memcpy. */
1281 amdfam10_memset, /* stringop_algs table for memset. */
1282 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1283 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1284 "32:25:8", /* Loop alignment. */
1285 "32:8:8", /* Jump alignment. */
1286 "0:0:8", /* Label alignment. */
1287 "32", /* Func alignment. */
1288 4, /* Small unroll limit. */
1289 2, /* Small unroll factor. */
1290 COSTS_N_INSNS (2), /* Branch mispredict scale. */
1293 /* BDVER has an optimized REP instruction for medium-sized blocks, but for
1294 very small blocks it is better to use a loop. For large blocks, a libcall
1295 can do non-temporal accesses and beat inline code considerably. */
1296 static stringop_algs bdver_memcpy[2] = {
1297 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1298 {-1, rep_prefix_4_byte, false}}},
1299 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1300 {-1, libcall, false}}}};
1301 static stringop_algs bdver_memset[2] = {
1302 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1303 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1304 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1305 {-1, libcall, false}}}};
/* Per-CPU cost table for BDVER.  This is a positional initializer: each
   value is labelled by the comment that follows it, so entries must stay
   in exactly this order.  Per the notes at the top of the file, costs are
   relative to an add and COSTS_N_INSNS (N) is assumed to be N * 4; the
   register allocator section is instead relative to an integer reg-reg
   move cost of 2.
   NOTE(review): declared "const" but not "static", unlike the geode, k6,
   athlon and k8 tables -- confirm that is intentional.  */
1307 const struct processor_costs bdver_cost = {
1309 /* Start of register allocator costs. integer->integer move cost is 2. */
1310 8, /* cost for loading QImode using movzbl */
1311 {8, 8, 8}, /* cost of loading integer registers
1312 in QImode, HImode and SImode.
1313 Relative to reg-reg move (2). */
1314 {8, 8, 8}, /* cost of storing integer registers */
1315 4, /* cost of reg,reg fld/fst */
1316 {12, 12, 28}, /* cost of loading fp registers
1317 in SFmode, DFmode and XFmode */
1318 {10, 10, 18}, /* cost of storing fp registers
1319 in SFmode, DFmode and XFmode */
1320 4, /* cost of moving MMX register */
1321 {12, 12}, /* cost of loading MMX registers
1322 in SImode and DImode */
1323 {10, 10}, /* cost of storing MMX registers
1324 in SImode and DImode */
1325 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1326 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1327 in 32,64,128,256 and 512-bit */
1328 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1329 in 32,64,128,256 and 512-bit */
1330 16, 20, /* SSE->integer and integer->SSE moves */
1331 16, 20, /* mask->integer and integer->mask moves */
1332 {8, 8, 8}, /* cost of loading mask register
1333 in QImode, HImode, SImode. */
1334 {8, 8, 8}, /* cost of storing mask register
1335 in QImode, HImode, SImode. */
1336 2, /* cost of moving mask register. */
1337 /* End of register allocator costs. */
1340 COSTS_N_INSNS (1), /* cost of an add instruction */
1341 COSTS_N_INSNS (1), /* cost of a lea instruction */
1342 COSTS_N_INSNS (1), /* variable shift costs */
1343 COSTS_N_INSNS (1), /* constant shift costs */
1344 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1345 COSTS_N_INSNS (4), /* HI */
1346 COSTS_N_INSNS (4), /* SI */
1347 COSTS_N_INSNS (6), /* DI */
1348 COSTS_N_INSNS (6)}, /* other */
1349 0, /* cost of multiply per each bit set */
1350 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1351 COSTS_N_INSNS (35), /* HI */
1352 COSTS_N_INSNS (51), /* SI */
1353 COSTS_N_INSNS (83), /* DI */
1354 COSTS_N_INSNS (83)}, /* other */
1355 COSTS_N_INSNS (1), /* cost of movsx */
1356 COSTS_N_INSNS (1), /* cost of movzx */
1357 8, /* "large" insn */
1358 9, /* MOVE_RATIO */
1359 6, /* CLEAR_RATIO */
1360 {8, 8, 8}, /* cost of loading integer registers
1361 in QImode, HImode and SImode.
1362 Relative to reg-reg move (2). */
1363 {8, 8, 8}, /* cost of storing integer registers */
1364 {12, 12, 10, 40, 60}, /* cost of loading SSE register
1365 in 32bit, 64bit, 128bit, 256bit and 512bit */
1366 {10, 10, 10, 40, 60}, /* cost of storing SSE register
1367 in 32bit, 64bit, 128bit, 256bit and 512bit */
1368 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1369 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1370 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1371 16, /* cost of moving SSE register to integer. */
1372 12, 12, /* Gather load static, per_elt. */
1373 10, 10, /* Gather store static, per_elt. */
1374 16, /* size of l1 cache. */
1375 2048, /* size of l2 cache. */
1376 64, /* size of prefetch block */
1377 /* New AMD processors never drop prefetches; if they cannot be performed
1378 immediately, they are queued. We set the number of simultaneous prefetches
1379 to a large constant to reflect this (leaving the number of prefetches
1380 completely unlimited is probably not a good idea, as their execution also
1381 takes some time). */
1382 100, /* number of parallel prefetches */
1383 2, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1391 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1392 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1393 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1394 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1395 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1396 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1397 /* 9-24 (NOTE(review): presumably the latency range behind the DIVSS
   entry below -- confirm).  */
1398 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1399 /* 9-27 (NOTE(review): presumably the latency range behind the DIVSD
   entry below -- confirm).  */
1400 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1401 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1402 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1403 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1404 bdver_memcpy, /* stringop_algs table for memcpy. */
1405 bdver_memset, /* stringop_algs table for memset. */
1406 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1407 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1408 "16:11:8", /* Loop alignment. */
1409 "16:8:8", /* Jump alignment. */
1410 "0:0:8", /* Label alignment. */
1411 "11", /* Func alignment. NOTE(review): not a power of two, unlike the
   "16"/"32" used by the neighbouring tables -- confirm intended. */
1412 4, /* Small unroll limit. */
1413 2, /* Small unroll factor. */
1414 COSTS_N_INSNS (2), /* Branch mispredict scale. */
1418 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1419 very small blocks it is better to use a loop. For large blocks, a libcall
1420 can do non-temporal accesses and beat inline code considerably. */
1421 static stringop_algs znver1_memcpy[2] = {
1422 /* 32-bit tuning. */
1423 {libcall, {{6, loop, false},
1424 {14, unrolled_loop, false},
1425 {-1, libcall, false}}},
1426 /* 64-bit tuning. */
1427 {libcall, {{16, loop, false},
1428 {128, rep_prefix_8_byte, false},
1429 {-1, libcall, false}}}};
1430 static stringop_algs znver1_memset[2] = {
1431 /* 32-bit tuning. */
1432 {libcall, {{8, loop, false},
1433 {24, unrolled_loop, false},
1434 {128, rep_prefix_4_byte, false},
1435 {-1, libcall, false}}},
1436 /* 64-bit tuning. */
1437 {libcall, {{48, unrolled_loop, false},
1438 {128, rep_prefix_8_byte, false},
1439 {-1, libcall, false}}}};
1440 struct processor_costs znver1_cost = {
1442 /* Start of register allocator costs. integer->integer move cost is 2. */
1444 /* reg-reg moves are done by renaming and thus they are even cheaper than
1445 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1446 to doubles of latencies, we do not model this correctly. It does not
1447 seem to make a practical difference to bump prices up even more. */
1448 6, /* cost for loading QImode using
1449 movzbl. */
1450 {6, 6, 6}, /* cost of loading integer registers
1451 in QImode, HImode and SImode.
1452 Relative to reg-reg move (2). */
1453 {8, 8, 8}, /* cost of storing integer
1454 registers. */
1455 2, /* cost of reg,reg fld/fst. */
1456 {6, 6, 16}, /* cost of loading fp registers
1457 in SFmode, DFmode and XFmode. */
1458 {8, 8, 16}, /* cost of storing fp registers
1459 in SFmode, DFmode and XFmode. */
1460 2, /* cost of moving MMX register. */
1461 {6, 6}, /* cost of loading MMX registers
1462 in SImode and DImode. */
1463 {8, 8}, /* cost of storing MMX registers
1464 in SImode and DImode. */
1465 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1466 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1467 in 32,64,128,256 and 512-bit. */
1468 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1469 in 32,64,128,256 and 512-bit. */
1470 6, 6, /* SSE->integer and integer->SSE moves. */
1471 8, 8, /* mask->integer and integer->mask moves */
1472 {6, 6, 6}, /* cost of loading mask register
1473 in QImode, HImode, SImode. */
1474 {8, 8, 8}, /* cost if storing mask register
1475 in QImode, HImode, SImode. */
1476 2, /* cost of moving mask register. */
1477 /* End of register allocator costs. */
1480 COSTS_N_INSNS (1), /* cost of an add instruction. */
1481 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1482 COSTS_N_INSNS (1), /* variable shift costs. */
1483 COSTS_N_INSNS (1), /* constant shift costs. */
1484 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1485 COSTS_N_INSNS (3), /* HI. */
1486 COSTS_N_INSNS (3), /* SI. */
1487 COSTS_N_INSNS (3), /* DI. */
1488 COSTS_N_INSNS (3)}, /* other. */
1489 0, /* cost of multiply per each bit
1490 set. */
1491 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1492 bound. */
1493 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1494 COSTS_N_INSNS (22), /* HI. */
1495 COSTS_N_INSNS (30), /* SI. */
1496 COSTS_N_INSNS (45), /* DI. */
1497 COSTS_N_INSNS (45)}, /* other. */
1498 COSTS_N_INSNS (1), /* cost of movsx. */
1499 COSTS_N_INSNS (1), /* cost of movzx. */
1500 8, /* "large" insn. */
1501 9, /* MOVE_RATIO. */
1502 6, /* CLEAR_RATIO */
1503 {6, 6, 6}, /* cost of loading integer registers
1504 in QImode, HImode and SImode.
1505 Relative to reg-reg move (2). */
1506 {8, 8, 8}, /* cost of storing integer
1507 registers. */
1508 {6, 6, 6, 12, 24}, /* cost of loading SSE register
1509 in 32bit, 64bit, 128bit, 256bit and 512bit */
1510 {8, 8, 8, 16, 32}, /* cost of storing SSE register
1511 in 32bit, 64bit, 128bit, 256bit and 512bit */
1512 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1513 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1514 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1515 6, /* cost of moving SSE register to integer. */
1516 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1517 throughput 12. Approx 9 uops do not depend on vector size and every load
1518 is 7 uops. */
1519 18, 8, /* Gather load static, per_elt. */
1520 18, 10, /* Gather store static, per_elt. */
1521 32, /* size of l1 cache. */
1522 512, /* size of l2 cache. */
1523 64, /* size of prefetch block. */
1524 /* New AMD processors never drop prefetches; if they cannot be performed
1525 immediately, they are queued. We set number of simultaneous prefetches
1526 to a large constant to reflect this (it probably is not a good idea not
1527 to limit number of prefetches at all, as their execution also takes some
1528 time). */
1529 100, /* number of parallel prefetches. */
1530 3, /* Branch cost. */
1531 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1532 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1533 /* Latency of fdiv is 8-15. */
1534 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1535 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1536 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1537 /* Latency of fsqrt is 4-10. */
1538 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1540 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1541 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1542 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1543 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1544 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1545 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1546 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1547 /* 9-13 */
1548 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1549 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1550 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1551 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1552 and it can execute 2 integer additions and 2 multiplications thus
1553 reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
1554 that 4 works better than 6 probably due to register pressure.
1556 Integer vector operations are taken by FP unit and execute 3 vector
1557 plus/minus operations per cycle but only one multiply. This is adjusted
1558 in ix86_reassociation_width. */
1559 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1560 znver1_memcpy,
1561 znver1_memset,
1562 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1563 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1564 "16", /* Loop alignment. */
1565 "16", /* Jump alignment. */
1566 "0:0:8", /* Label alignment. */
1567 "16", /* Func alignment. */
1568 4, /* Small unroll limit. */
1569 2, /* Small unroll factor. */
1570 COSTS_N_INSNS (2), /* Branch mispredict scale. */
1573 /* ZNVER2 has optimized REP instruction for medium sized blocks, but for
1574 very small blocks it is better to use loop. For large blocks, libcall
1575 can do non-temporal accesses and beat inline considerably.
   Each inner entry presumably reads {max block size, algorithm, noalign
   flag}, with -1 meaning "no upper bound" -- TODO confirm against the
   stringop_algs declaration. */
1576 static stringop_algs znver2_memcpy[2] = {
1577 /* 32-bit tuning. */
1578 {libcall, {{6, loop, false},
1579 {14, unrolled_loop, false},
1580 {-1, libcall, false}}},
1581 /* 64-bit tuning. */
1582 {libcall, {{16, loop, false},
1583 {64, rep_prefix_4_byte, false},
1584 {-1, libcall, false}}}};
/* memset strategy table for znver2: element [0] is the 32-bit tuning,
   element [1] the 64-bit tuning. Each inner entry presumably reads
   {max block size, algorithm, noalign flag}, with -1 meaning "no upper
   bound" -- TODO confirm against the stringop_algs declaration. */
1585 static stringop_algs znver2_memset[2] = {
1586 /* 32-bit tuning. */
1587 {libcall, {{8, loop, false},
1588 {24, unrolled_loop, false},
1589 {128, rep_prefix_4_byte, false},
1590 {-1, libcall, false}}},
1591 /* 64-bit tuning. */
1592 {libcall, {{24, rep_prefix_4_byte, false},
1593 {128, rep_prefix_8_byte, false},
1594 {-1, libcall, false}}}};
/* processor_costs initializer for znver2. Entries are positional; the
   trailing comment on each entry names the cost it sets. */
1596 struct processor_costs znver2_cost = {
1598 /* Start of register allocator costs. integer->integer move cost is 2. */
1600 /* reg-reg moves are done by renaming and thus they are even cheaper than
1601 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1602 to doubles of latencies, we do not model this correctly. It does not
1603 seem to make practical difference to bump prices up even more. */
1604 6, /* cost for loading QImode using
1605 movzbl. */
1606 {6, 6, 6}, /* cost of loading integer registers
1607 in QImode, HImode and SImode.
1608 Relative to reg-reg move (2). */
1609 {8, 8, 8}, /* cost of storing integer
1610 registers. */
1611 2, /* cost of reg,reg fld/fst. */
1612 {6, 6, 16}, /* cost of loading fp registers
1613 in SFmode, DFmode and XFmode. */
1614 {8, 8, 16}, /* cost of storing fp registers
1615 in SFmode, DFmode and XFmode. */
1616 2, /* cost of moving MMX register. */
1617 {6, 6}, /* cost of loading MMX registers
1618 in SImode and DImode. */
1619 {8, 8}, /* cost of storing MMX registers
1620 in SImode and DImode. */
1621 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1622 register. */
1623 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1624 in 32,64,128,256 and 512-bit. */
1625 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1626 in 32,64,128,256 and 512-bit. */
1627 6, 6, /* SSE->integer and integer->SSE
1628 moves. */
1629 8, 8, /* mask->integer and integer->mask moves */
1630 {6, 6, 6}, /* cost of loading mask register
1631 in QImode, HImode, SImode. */
1632 {8, 8, 8}, /* cost of storing mask register
1633 in QImode, HImode, SImode. */
1634 2, /* cost of moving mask register. */
1635 /* End of register allocator costs. */
1638 COSTS_N_INSNS (1), /* cost of an add instruction. */
1639 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1640 COSTS_N_INSNS (1), /* variable shift costs. */
1641 COSTS_N_INSNS (1), /* constant shift costs. */
1642 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1643 COSTS_N_INSNS (3), /* HI. */
1644 COSTS_N_INSNS (3), /* SI. */
1645 COSTS_N_INSNS (3), /* DI. */
1646 COSTS_N_INSNS (3)}, /* other. */
1647 0, /* cost of multiply per each bit
1648 set. */
1649 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1650 bound. */
1651 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1652 COSTS_N_INSNS (22), /* HI. */
1653 COSTS_N_INSNS (30), /* SI. */
1654 COSTS_N_INSNS (45), /* DI. */
1655 COSTS_N_INSNS (45)}, /* other. */
1656 COSTS_N_INSNS (1), /* cost of movsx. */
1657 COSTS_N_INSNS (1), /* cost of movzx. */
1658 8, /* "large" insn. */
1659 9, /* MOVE_RATIO. */
1660 6, /* CLEAR_RATIO */
1661 {6, 6, 6}, /* cost of loading integer registers
1662 in QImode, HImode and SImode.
1663 Relative to reg-reg move (2). */
1664 {8, 8, 8}, /* cost of storing integer
1665 registers. */
1666 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1667 in 32bit, 64bit, 128bit, 256bit and 512bit */
1668 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1669 in 32bit, 64bit, 128bit, 256bit and 512bit */
1670 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1671 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1672 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1673 register. */
1674 6, /* cost of moving SSE register to integer. */
1675 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1676 throughput 12. Approx 9 uops do not depend on vector size and every load
1677 is 7 uops. */
1678 18, 8, /* Gather load static, per_elt. */
1679 18, 10, /* Gather store static, per_elt. */
1680 32, /* size of l1 cache. */
1681 512, /* size of l2 cache. */
1682 64, /* size of prefetch block. */
1683 /* New AMD processors never drop prefetches; if they cannot be performed
1684 immediately, they are queued. We set number of simultaneous prefetches
1685 to a large constant to reflect this (it probably is not a good idea not
1686 to limit number of prefetches at all, as their execution also takes some
1687 time). */
1688 100, /* number of parallel prefetches. */
1689 3, /* Branch cost. */
1690 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1691 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1692 /* Latency of fdiv is 8-15. */
1693 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1694 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1695 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1696 /* Latency of fsqrt is 4-10. */
1697 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1699 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1700 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1701 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1702 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1703 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1704 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1705 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1706 /* 9-13. */
1707 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1708 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1709 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1710 /* Zen can execute 4 integer operations per cycle. FP operations
1711 take 3 cycles and it can execute 2 integer additions and 2
1712 multiplications thus reassociation may make sense up to width of 6.
1713 SPEC2k6 benchmarks suggest
1714 that 4 works better than 6 probably due to register pressure.
1716 Integer vector operations are taken by FP unit and execute 3 vector
1717 plus/minus operations per cycle but only one multiply. This is adjusted
1718 in ix86_reassociation_width. */
1719 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1720 znver2_memcpy,
1721 znver2_memset,
1722 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1723 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1724 "16", /* Loop alignment. */
1725 "16", /* Jump alignment. */
1726 "0:0:8", /* Label alignment. */
1727 "16", /* Func alignment. */
1728 4, /* Small unroll limit. */
1729 2, /* Small unroll factor. */
1730 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* processor_costs initializer for znver3. Entries are positional; the
   trailing comment on each entry names the cost it sets. */
1733 struct processor_costs znver3_cost = {
1735 /* Start of register allocator costs. integer->integer move cost is 2. */
1737 /* reg-reg moves are done by renaming and thus they are even cheaper than
1738 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1739 to doubles of latencies, we do not model this correctly. It does not
1740 seem to make practical difference to bump prices up even more. */
1741 6, /* cost for loading QImode using
1742 movzbl. */
1743 {6, 6, 6}, /* cost of loading integer registers
1744 in QImode, HImode and SImode.
1745 Relative to reg-reg move (2). */
1746 {8, 8, 8}, /* cost of storing integer
1747 registers. */
1748 2, /* cost of reg,reg fld/fst. */
1749 {6, 6, 16}, /* cost of loading fp registers
1750 in SFmode, DFmode and XFmode. */
1751 {8, 8, 16}, /* cost of storing fp registers
1752 in SFmode, DFmode and XFmode. */
1753 2, /* cost of moving MMX register. */
1754 {6, 6}, /* cost of loading MMX registers
1755 in SImode and DImode. */
1756 {8, 8}, /* cost of storing MMX registers
1757 in SImode and DImode. */
1758 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1759 register. */
1760 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1761 in 32,64,128,256 and 512-bit. */
1762 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1763 in 32,64,128,256 and 512-bit. */
1764 6, 6, /* SSE->integer and integer->SSE
1765 moves. */
1766 8, 8, /* mask->integer and integer->mask moves */
1767 {6, 6, 6}, /* cost of loading mask register
1768 in QImode, HImode, SImode. */
1769 {8, 8, 8}, /* cost of storing mask register
1770 in QImode, HImode, SImode. */
1771 2, /* cost of moving mask register. */
1772 /* End of register allocator costs. */
1775 COSTS_N_INSNS (1), /* cost of an add instruction. */
1776 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1777 COSTS_N_INSNS (1), /* variable shift costs. */
1778 COSTS_N_INSNS (1), /* constant shift costs. */
1779 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1780 COSTS_N_INSNS (3), /* HI. */
1781 COSTS_N_INSNS (3), /* SI. */
1782 COSTS_N_INSNS (3), /* DI. */
1783 COSTS_N_INSNS (3)}, /* other. */
1784 0, /* cost of multiply per each bit
1785 set. */
1786 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
1787 COSTS_N_INSNS (10), /* HI. */
1788 COSTS_N_INSNS (12), /* SI. */
1789 COSTS_N_INSNS (17), /* DI. */
1790 COSTS_N_INSNS (17)}, /* other. */
1791 COSTS_N_INSNS (1), /* cost of movsx. */
1792 COSTS_N_INSNS (1), /* cost of movzx. */
1793 8, /* "large" insn. */
1794 9, /* MOVE_RATIO. */
1795 6, /* CLEAR_RATIO */
1796 {6, 6, 6}, /* cost of loading integer registers
1797 in QImode, HImode and SImode.
1798 Relative to reg-reg move (2). */
1799 {8, 8, 8}, /* cost of storing integer
1800 registers. */
1801 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1802 in 32bit, 64bit, 128bit, 256bit and 512bit */
1803 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1804 in 32bit, 64bit, 128bit, 256bit and 512bit */
1805 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1806 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1807 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1808 register. */
1809 6, /* cost of moving SSE register to integer. */
1810 /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1811 throughput 9. Approx 7 uops do not depend on vector size and every load
1812 is 4 uops. */
1813 14, 8, /* Gather load static, per_elt. */
1814 14, 10, /* Gather store static, per_elt. */
1815 32, /* size of l1 cache. */
1816 512, /* size of l2 cache. */
1817 64, /* size of prefetch block. */
1818 /* New AMD processors never drop prefetches; if they cannot be performed
1819 immediately, they are queued. We set number of simultaneous prefetches
1820 to a large constant to reflect this (it probably is not a good idea not
1821 to limit number of prefetches at all, as their execution also takes some
1822 time). */
1823 100, /* number of parallel prefetches. */
1824 3, /* Branch cost. */
1825 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1826 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1827 /* Latency of fdiv is 8-15. */
1828 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1829 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1830 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1831 /* Latency of fsqrt is 4-10. */
1832 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1834 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1835 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1836 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1837 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1838 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1839 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1840 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1841 /* 9-13. */
1842 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1843 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1844 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1845 /* Zen can execute 4 integer operations per cycle. FP operations
1846 take 3 cycles and it can execute 2 integer additions and 2
1847 multiplications thus reassociation may make sense up to width of 6.
1848 SPEC2k6 benchmarks suggest
1849 that 4 works better than 6 probably due to register pressure.
1851 Integer vector operations are taken by FP unit and execute 3 vector
1852 plus/minus operations per cycle but only one multiply. This is adjusted
1853 in ix86_reassociation_width. */
1854 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
/* The string operation tables are the znver2 ones; this table names no
   znver3-specific memcpy/memset tables. */
1855 znver2_memcpy,
1856 znver2_memset,
1857 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1858 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1859 "16", /* Loop alignment. */
1860 "16", /* Jump alignment. */
1861 "0:0:8", /* Label alignment. */
1862 "16", /* Func alignment. */
1863 4, /* Small unroll limit. */
1864 2, /* Small unroll factor. */
1865 COSTS_N_INSNS (2), /* Branch mispredict scale. */
1868 /* processor_costs initializer for znver4. This table was described as
   replicating the znver3_cost table. NOTE(review): several entries now
   differ from znver3_cost (e.g. the fp register load/store costs and the
   l2 cache size), so treat it as a separate table. */
1869 struct processor_costs znver4_cost = {
1871 /* Start of register allocator costs. integer->integer move cost is 2. */
1873 /* reg-reg moves are done by renaming and thus they are even cheaper than
1874 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1875 to doubles of latencies, we do not model this correctly. It does not
1876 seem to make practical difference to bump prices up even more. */
1877 6, /* cost for loading QImode using
1878 movzbl. */
1879 {6, 6, 6}, /* cost of loading integer registers
1880 in QImode, HImode and SImode.
1881 Relative to reg-reg move (2). */
1882 {8, 8, 8}, /* cost of storing integer
1883 registers. */
1884 2, /* cost of reg,reg fld/fst. */
1885 {14, 14, 17}, /* cost of loading fp registers
1886 in SFmode, DFmode and XFmode. */
1887 {12, 12, 16}, /* cost of storing fp registers
1888 in SFmode, DFmode and XFmode. */
1889 2, /* cost of moving MMX register. */
1890 {6, 6}, /* cost of loading MMX registers
1891 in SImode and DImode. */
1892 {8, 8}, /* cost of storing MMX registers
1893 in SImode and DImode. */
1894 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1895 register. */
1896 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1897 in 32,64,128,256 and 512-bit. */
1898 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
1899 in 32,64,128,256 and 512-bit. */
1900 6, 8, /* SSE->integer and integer->SSE
1901 moves. */
1902 8, 8, /* mask->integer and integer->mask moves */
1903 {6, 6, 6}, /* cost of loading mask register
1904 in QImode, HImode, SImode. */
1905 {8, 8, 8}, /* cost of storing mask register
1906 in QImode, HImode, SImode. */
1907 2, /* cost of moving mask register. */
1908 /* End of register allocator costs. */
1911 COSTS_N_INSNS (1), /* cost of an add instruction. */
1912 /* TODO: Lea with 3 components has cost 2. */
1913 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1914 COSTS_N_INSNS (1), /* variable shift costs. */
1915 COSTS_N_INSNS (1), /* constant shift costs. */
1916 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1917 COSTS_N_INSNS (3), /* HI. */
1918 COSTS_N_INSNS (3), /* SI. */
1919 COSTS_N_INSNS (3), /* DI. */
1920 COSTS_N_INSNS (3)}, /* other. */
1921 0, /* cost of multiply per each bit
1922 set. */
1923 {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */
1924 COSTS_N_INSNS (13), /* HI. */
1925 COSTS_N_INSNS (13), /* SI. */
1926 COSTS_N_INSNS (18), /* DI. */
1927 COSTS_N_INSNS (18)}, /* other. */
1928 COSTS_N_INSNS (1), /* cost of movsx. */
1929 COSTS_N_INSNS (1), /* cost of movzx. */
1930 8, /* "large" insn. */
1931 9, /* MOVE_RATIO. */
1932 6, /* CLEAR_RATIO */
1933 {6, 6, 6}, /* cost of loading integer registers
1934 in QImode, HImode and SImode.
1935 Relative to reg-reg move (2). */
1936 {8, 8, 8}, /* cost of storing integer
1937 registers. */
1938 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1939 in 32bit, 64bit, 128bit, 256bit and 512bit */
1940 {8, 8, 8, 12, 12}, /* cost of storing SSE register
1941 in 32bit, 64bit, 128bit, 256bit and 512bit */
1942 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
1943 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
1944 2, 2, 2, /* cost of moving XMM,YMM,ZMM
1945 register. */
1946 6, /* cost of moving SSE register to integer. */
1947 /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1948 throughput 5. Approx 7 uops do not depend on vector size and every load
1949 is 5 uops. */
1950 14, 10, /* Gather load static, per_elt. */
1951 14, 20, /* Gather store static, per_elt. */
1952 32, /* size of l1 cache. */
1953 1024, /* size of l2 cache. */
1954 64, /* size of prefetch block. */
1955 /* New AMD processors never drop prefetches; if they cannot be performed
1956 immediately, they are queued. We set number of simultaneous prefetches
1957 to a large constant to reflect this (it probably is not a good idea not
1958 to limit number of prefetches at all, as their execution also takes some
1959 time). */
1960 100, /* number of parallel prefetches. */
1961 3, /* Branch cost. */
1962 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
1963 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1964 /* Latency of fdiv is 8-15. */
1965 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1966 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1967 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1968 /* Latency of fsqrt is 4-10. */
1969 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
1971 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1972 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1973 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1974 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1975 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1976 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1977 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1978 /* 9-13. */
1979 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1980 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1981 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1982 /* Zen can execute 4 integer operations per cycle. FP operations
1983 take 3 cycles and it can execute 2 integer additions and 2
1984 multiplications thus reassociation may make sense up to width of 6.
1985 SPEC2k6 benchmarks suggest
1986 that 4 works better than 6 probably due to register pressure.
1988 Integer vector operations are taken by FP unit and execute 3 vector
1989 plus/minus operations per cycle but only one multiply. This is adjusted
1990 in ix86_reassociation_width. */
1991 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
/* The string operation tables are the znver2 ones; this table names no
   znver4-specific memcpy/memset tables. */
1992 znver2_memcpy,
1993 znver2_memset,
1994 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1995 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1996 "16", /* Loop alignment. */
1997 "16", /* Jump alignment. */
1998 "0:0:8", /* Label alignment. */
1999 "16", /* Func alignment. */
2000 4, /* Small unroll limit. */
2001 2, /* Small unroll factor. */
2002 COSTS_N_INSNS (2), /* Branch mispredict scale. */
2005 /* processor_costs initializer for znver5. This table was described as
   replicating the znver4_cost table. NOTE(review): several entries now
   differ from znver4_cost (e.g. the divide costs, the "large" insn value
   and the l1 cache size), so treat it as a separate table. */
2006 struct processor_costs znver5_cost = {
2008 /* Start of register allocator costs. integer->integer move cost is 2. */
2010 /* reg-reg moves are done by renaming and thus they are even cheaper than
2011 1 cycle. Because reg-reg move cost is 2 and following tables correspond
2012 to doubles of latencies, we do not model this correctly. It does not
2013 seem to make practical difference to bump prices up even more. */
2014 6, /* cost for loading QImode using
2015 movzbl. */
2016 {6, 6, 6}, /* cost of loading integer registers
2017 in QImode, HImode and SImode.
2018 Relative to reg-reg move (2). */
2019 {8, 8, 8}, /* cost of storing integer
2020 registers. */
2021 2, /* cost of reg,reg fld/fst. */
2022 {14, 14, 17}, /* cost of loading fp registers
2023 in SFmode, DFmode and XFmode. */
2024 {12, 12, 16}, /* cost of storing fp registers
2025 in SFmode, DFmode and XFmode. */
2026 2, /* cost of moving MMX register. */
2027 {6, 6}, /* cost of loading MMX registers
2028 in SImode and DImode. */
2029 {8, 8}, /* cost of storing MMX registers
2030 in SImode and DImode. */
2031 2, 2, 3, /* cost of moving XMM,YMM,ZMM
2032 register. */
2033 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2034 in 32,64,128,256 and 512-bit. */
2035 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
2036 in 32,64,128,256 and 512-bit. */
2037 6, 8, /* SSE->integer and integer->SSE
2038 moves. */
2039 8, 8, /* mask->integer and integer->mask moves */
2040 {6, 6, 6}, /* cost of loading mask register
2041 in QImode, HImode, SImode. */
2042 {8, 8, 8}, /* cost of storing mask register
2043 in QImode, HImode, SImode. */
2044 2, /* cost of moving mask register. */
2045 /* End of register allocator costs. */
2048 COSTS_N_INSNS (1), /* cost of an add instruction. */
2049 /* TODO: Lea with 3 components has cost 2. */
2050 COSTS_N_INSNS (1), /* cost of a lea instruction. */
2051 COSTS_N_INSNS (1), /* variable shift costs. */
2052 COSTS_N_INSNS (1), /* constant shift costs. */
2053 /* mul has latency 3, executes in 3 integer units. */
2054 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
2055 COSTS_N_INSNS (3), /* HI. */
2056 COSTS_N_INSNS (3), /* SI. */
2057 COSTS_N_INSNS (3), /* DI. */
2058 COSTS_N_INSNS (3)}, /* other. */
2059 0, /* cost of multiply per each bit
2060 set. */
2061 /* integer divide has latency of 8 cycles
2062 plus 1 for every 9 bits of quotient. */
2063 {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */
2064 COSTS_N_INSNS (11), /* HI. */
2065 COSTS_N_INSNS (13), /* SI. */
2066 COSTS_N_INSNS (16), /* DI. */
2067 COSTS_N_INSNS (16)}, /* other. */
2068 COSTS_N_INSNS (1), /* cost of movsx. */
2069 COSTS_N_INSNS (1), /* cost of movzx. */
2070 15, /* "large" insn. */
2071 9, /* MOVE_RATIO. */
2072 6, /* CLEAR_RATIO */
2073 {6, 6, 6}, /* cost of loading integer registers
2074 in QImode, HImode and SImode.
2075 Relative to reg-reg move (2). */
2076 {8, 8, 8}, /* cost of storing integer
2077 registers. */
2078 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2079 in 32bit, 64bit, 128bit, 256bit and 512bit */
2080 {8, 8, 8, 12, 12}, /* cost of storing SSE register
2081 in 32bit, 64bit, 128bit, 256bit and 512bit */
2082 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
2083 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
2084 2, 2, 2, /* cost of moving XMM,YMM,ZMM
2085 register. */
2086 6, /* cost of moving SSE register to integer. */
2088 /* TODO: gather and scatter instructions are currently disabled in
2089 x86-tune.def. In some cases they are however a win, see PR116582.
2090 We however need a good cost model for them. */
2091 14, 10, /* Gather load static, per_elt. */
2092 14, 20, /* Gather store static, per_elt. */
2093 48, /* size of l1 cache. */
2094 1024, /* size of l2 cache. */
2095 64, /* size of prefetch block. */
2096 /* New AMD processors never drop prefetches; if they cannot be performed
2097 immediately, they are queued. We set number of simultaneous prefetches
2098 to a large constant to reflect this (it probably is not a good idea not
2099 to limit number of prefetches at all, as their execution also takes some
2100 time). */
2101 100, /* number of parallel prefetches. */
2102 3, /* Branch cost. */
2103 /* TODO x87 latencies are still based on znver4.
2104 Probably not very important these days. */
2105 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
2106 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2107 /* Latency of fdiv is 8-15. */
2108 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
2109 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2110 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2111 /* Latency of fsqrt is 4-10. */
2112 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
2114 /* SSE instructions have typical throughput 4 and latency 1. */
2115 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2116 /* ADDSS has throughput 2 and latency 2
2117 (in some cases when source is another addition). */
2118 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
2119 /* MULSS has throughput 2 and latency 3. */
2120 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
2121 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
2122 /* FMA has throughput 2 and latency 4. */
2123 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2124 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2125 /* DIVSS has throughput 0.4 and latency 10. */
2126 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
2127 /* DIVSD has throughput 0.25 and latency 13. */
2128 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
2129 /* SQRTSS has throughput 0.22 and latency 14. */
2130 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2131 /* SQRTSD has throughput 0.13 and latency 20. */
2132 COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */
2133 /* Zen5 can execute:
2134 - integer ops: 6 per cycle, at most 3 multiplications.
2135 latency 1 for additions, 3 for multiplications (pipelined)
2137 Setting width of 9 for multiplication is probably excessive
2138 for register pressure.
2139 - fp ops: 2 additions per cycle, latency 2-3
2140 2 multiplications per cycle, latency 3
2141 - vector integer ops: 4 additions, latency 1
2142 2 multiplications, latency 4
2143 We increase width to 6 for multiplications
2144 in ix86_reassociation_width. */
2145 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */
/* The string operation tables are the znver2 ones; this table names no
   znver5-specific memcpy/memset tables. */
2146 znver2_memcpy,
2147 znver2_memset,
2148 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2149 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2150 "16", /* Loop alignment. */
2151 "16", /* Jump alignment. */
2152 "0:0:8", /* Label alignment. */
2153 "16", /* Func alignment. */
2154 4, /* Small unroll limit. */
2155 2, /* Small unroll factor. */
2156 COSTS_N_INSNS (2), /* Branch mispredict scale. */
2159 /* skylake_cost should produce code tuned for Skylake family of CPUs.
   The table below is the memcpy strategy table for that tuning; both
   elements are identical. Presumably element [0] is the 32-bit tuning and
   element [1] the 64-bit tuning, as labeled in the znver tables, and each
   inner entry reads {max block size, algorithm, noalign flag} with -1
   meaning "no upper bound" -- TODO confirm against the stringop_algs
   declaration. */
2160 static stringop_algs skylake_memcpy[2] = {
2161 {libcall,
2162 {{256, rep_prefix_1_byte, true},
2163 {256, loop, false},
2164 {-1, libcall, false}}},
2165 {libcall,
2166 {{256, rep_prefix_1_byte, true},
2167 {256, loop, false},
2168 {-1, libcall, false}}}};
/* memset strategy table for the Skylake tuning; both elements are
   identical and hold the same entries as skylake_memcpy. Presumably
   element [0] is the 32-bit tuning and element [1] the 64-bit tuning, as
   labeled in the znver tables -- TODO confirm against the stringop_algs
   declaration. */
2170 static stringop_algs skylake_memset[2] = {
2171 {libcall,
2172 {{256, rep_prefix_1_byte, true},
2173 {256, loop, false},
2174 {-1, libcall, false}}},
2175 {libcall,
2176 {{256, rep_prefix_1_byte, true},
2177 {256, loop, false},
2178 {-1, libcall, false}}}};
/* Cost table for Skylake tuning.  The initializer is positional: every value
   is identified only by its trailing comment, so entries must stay in order.
   Memory-move costs are stated relative to a reg-reg move of cost 2.  Integer
   load/store costs appear twice (register-allocator section and the later
   section) and the two sets of values differ here.  */
2180 static const
2181 struct processor_costs skylake_cost = {
2183 /* Start of register allocator costs. integer->integer move cost is 2. */
2184 6, /* cost for loading QImode using movzbl */
2185 {4, 4, 4}, /* cost of loading integer registers
2186 in QImode, HImode and SImode.
2187 Relative to reg-reg move (2). */
2188 {6, 6, 6}, /* cost of storing integer registers */
2189 2, /* cost of reg,reg fld/fst */
2190 {6, 6, 8}, /* cost of loading fp registers
2191 in SFmode, DFmode and XFmode */
2192 {6, 6, 10}, /* cost of storing fp registers
2193 in SFmode, DFmode and XFmode */
2194 2, /* cost of moving MMX register */
2195 {6, 6}, /* cost of loading MMX registers
2196 in SImode and DImode */
2197 {6, 6}, /* cost of storing MMX registers
2198 in SImode and DImode */
2199 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2200 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2201 in 32,64,128,256 and 512-bit */
2202 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2203 in 32,64,128,256 and 512-bit */
2204 6, 6, /* SSE->integer and integer->SSE moves */
2205 6, 6, /* mask->integer and integer->mask moves */
2206 {8, 8, 8}, /* cost of loading mask register
2207 in QImode, HImode, SImode. */
2208 {6, 6, 6}, /* cost of storing mask register
2209 in QImode, HImode, SImode. */
2210 3, /* cost of moving mask register. */
2211 /* End of register allocator costs. */
2214 COSTS_N_INSNS (1), /* cost of an add instruction */
2215 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2216 COSTS_N_INSNS (1), /* variable shift costs */
2217 COSTS_N_INSNS (1), /* constant shift costs */
2218 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2219 COSTS_N_INSNS (3), /* HI */
2220 COSTS_N_INSNS (3), /* SI */
2221 COSTS_N_INSNS (3), /* DI */
2222 COSTS_N_INSNS (3)}, /* other */
2223 0, /* cost of multiply per each bit set */
2224 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2225 model is not realistic. We compensate by increasing the latencies a bit. */
2226 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2227 COSTS_N_INSNS (11), /* HI */
2228 COSTS_N_INSNS (14), /* SI */
2229 COSTS_N_INSNS (76), /* DI */
2230 COSTS_N_INSNS (76)}, /* other */
2231 COSTS_N_INSNS (1), /* cost of movsx */
2232 COSTS_N_INSNS (0), /* cost of movzx */
2233 8, /* "large" insn */
2234 17, /* MOVE_RATIO */
2235 17, /* CLEAR_RATIO */
2236 {6, 6, 6}, /* cost of loading integer registers
2237 in QImode, HImode and SImode.
2238 Relative to reg-reg move (2). */
2239 {8, 8, 8}, /* cost of storing integer registers */
2240 {8, 8, 8, 8, 16}, /* cost of loading SSE register
2241 in 32bit, 64bit, 128bit, 256bit and 512bit */
2242 {8, 8, 8, 8, 16}, /* cost of storing SSE register
2243 in 32bit, 64bit, 128bit, 256bit and 512bit */
2244 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2245 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2246 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2247 6, /* cost of moving SSE register to integer. */
2248 20, 8, /* Gather load static, per_elt. */
2249 22, 10, /* Gather store static, per_elt. */
2250 64, /* size of l1 cache. */
2251 512, /* size of l2 cache. */
2252 64, /* size of prefetch block */
2253 6, /* number of parallel prefetches */
2254 3, /* Branch cost */
2255 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2256 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2257 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2258 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2259 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2260 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2262 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2263 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2264 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2265 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2266 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2267 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2268 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2269 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2270 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2271 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2272 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2273 skylake_memcpy,
2274 skylake_memset,
2275 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2276 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2277 "16:11:8", /* Loop alignment. */
2278 "16:11:8", /* Jump alignment. */
2279 "0:0:8", /* Label alignment. */
2280 "16", /* Func alignment. */
2281 4, /* Small unroll limit. */
2282 2, /* Small unroll factor. */
2283 COSTS_N_INSNS (2), /* Branch mispredict scale. */
2286 /* icelake_cost should produce code tuned for the Icelake family of CPUs.
2287 NB: rep_prefix_1_byte is used only for known size. */
/* memcpy strategy table referenced by icelake_cost.  Both array entries are
   identical.
   NOTE(review): the two entries presumably select 32-bit and 64-bit code, and
   each inner triple is presumably {max size, algorithm, noalign} -- confirm
   against the stringop_algs declaration.  */
2289 static stringop_algs icelake_memcpy[2] = {
2290 {libcall,
2291 {{256, rep_prefix_1_byte, true},
2292 {256, loop, false},
2293 {-1, libcall, false}}},
2294 {libcall,
2295 {{256, rep_prefix_1_byte, true},
2296 {256, loop, false},
2297 {-1, libcall, false}}}};
/* memset strategy table referenced by icelake_cost.  Both array entries are
   identical, and the contents match icelake_memcpy.
   NOTE(review): entry and field meanings are not visible here -- confirm
   against the stringop_algs declaration.  */
2299 static stringop_algs icelake_memset[2] = {
2300 {libcall,
2301 {{256, rep_prefix_1_byte, true},
2302 {256, loop, false},
2303 {-1, libcall, false}}},
2304 {libcall,
2305 {{256, rep_prefix_1_byte, true},
2306 {256, loop, false},
2307 {-1, libcall, false}}}};
/* Cost table for Icelake tuning.  The initializer is positional: every value
   is identified only by its trailing comment, so entries must stay in order.
   Memory-move costs are stated relative to a reg-reg move of cost 2.  Integer
   load/store costs appear twice (register-allocator section and the later
   section) and the two sets of values differ here.  */
2309 static const
2310 struct processor_costs icelake_cost = {
2312 /* Start of register allocator costs. integer->integer move cost is 2. */
2313 6, /* cost for loading QImode using movzbl */
2314 {4, 4, 4}, /* cost of loading integer registers
2315 in QImode, HImode and SImode.
2316 Relative to reg-reg move (2). */
2317 {6, 6, 6}, /* cost of storing integer registers */
2318 2, /* cost of reg,reg fld/fst */
2319 {6, 6, 8}, /* cost of loading fp registers
2320 in SFmode, DFmode and XFmode */
2321 {6, 6, 10}, /* cost of storing fp registers
2322 in SFmode, DFmode and XFmode */
2323 2, /* cost of moving MMX register */
2324 {6, 6}, /* cost of loading MMX registers
2325 in SImode and DImode */
2326 {6, 6}, /* cost of storing MMX registers
2327 in SImode and DImode */
2328 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2329 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2330 in 32,64,128,256 and 512-bit */
2331 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2332 in 32,64,128,256 and 512-bit */
2333 6, 6, /* SSE->integer and integer->SSE moves */
2334 6, 6, /* mask->integer and integer->mask moves */
2335 {8, 8, 8}, /* cost of loading mask register
2336 in QImode, HImode, SImode. */
2337 {6, 6, 6}, /* cost of storing mask register
2338 in QImode, HImode, SImode. */
2339 3, /* cost of moving mask register. */
2340 /* End of register allocator costs. */
2343 COSTS_N_INSNS (1), /* cost of an add instruction */
2344 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2345 COSTS_N_INSNS (1), /* variable shift costs */
2346 COSTS_N_INSNS (1), /* constant shift costs */
2347 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2348 COSTS_N_INSNS (3), /* HI */
2349 COSTS_N_INSNS (3), /* SI */
2350 COSTS_N_INSNS (3), /* DI */
2351 COSTS_N_INSNS (3)}, /* other */
2352 0, /* cost of multiply per each bit set */
2353 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2354 model is not realistic. We compensate by increasing the latencies a bit. */
2355 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2356 COSTS_N_INSNS (11), /* HI */
2357 COSTS_N_INSNS (14), /* SI */
2358 COSTS_N_INSNS (76), /* DI */
2359 COSTS_N_INSNS (76)}, /* other */
2360 COSTS_N_INSNS (1), /* cost of movsx */
2361 COSTS_N_INSNS (0), /* cost of movzx */
2362 8, /* "large" insn */
2363 17, /* MOVE_RATIO */
2364 17, /* CLEAR_RATIO */
2365 {6, 6, 6}, /* cost of loading integer registers
2366 in QImode, HImode and SImode.
2367 Relative to reg-reg move (2). */
2368 {8, 8, 8}, /* cost of storing integer registers */
2369 {8, 8, 8, 8, 16}, /* cost of loading SSE register
2370 in 32bit, 64bit, 128bit, 256bit and 512bit */
2371 {8, 8, 8, 8, 16}, /* cost of storing SSE register
2372 in 32bit, 64bit, 128bit, 256bit and 512bit */
2373 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2374 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2375 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2376 6, /* cost of moving SSE register to integer. */
2377 20, 8, /* Gather load static, per_elt. */
2378 22, 10, /* Gather store static, per_elt. */
2379 64, /* size of l1 cache. */
2380 512, /* size of l2 cache. */
2381 64, /* size of prefetch block */
2382 6, /* number of parallel prefetches */
2383 3, /* Branch cost */
2384 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2385 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2386 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2387 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2388 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2389 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2391 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2392 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2393 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2394 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2395 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2396 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2397 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2398 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2399 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2400 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2401 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2402 icelake_memcpy,
2403 icelake_memset,
2404 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2405 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2406 "16:11:8", /* Loop alignment. */
2407 "16:11:8", /* Jump alignment. */
2408 "0:0:8", /* Label alignment. */
2409 "16", /* Func alignment. */
2410 4, /* Small unroll limit. */
2411 2, /* Small unroll factor. */
2412 COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
2415 /* alderlake_cost should produce code tuned for the alderlake family of CPUs. */
/* memcpy strategy table referenced by alderlake_cost.  Both array entries are
   identical.
   NOTE(review): the two entries presumably select 32-bit and 64-bit code, and
   each inner triple is presumably {max size, algorithm, noalign} -- confirm
   against the stringop_algs declaration.  */
2416 static stringop_algs alderlake_memcpy[2] = {
2417 {libcall,
2418 {{256, rep_prefix_1_byte, true},
2419 {256, loop, false},
2420 {-1, libcall, false}}},
2421 {libcall,
2422 {{256, rep_prefix_1_byte, true},
2423 {256, loop, false},
2424 {-1, libcall, false}}}};
/* memset strategy table referenced by alderlake_cost.  Both array entries are
   identical, and the contents match alderlake_memcpy.
   NOTE(review): entry and field meanings are not visible here -- confirm
   against the stringop_algs declaration.  */
2425 static stringop_algs alderlake_memset[2] = {
2426 {libcall,
2427 {{256, rep_prefix_1_byte, true},
2428 {256, loop, false},
2429 {-1, libcall, false}}},
2430 {libcall,
2431 {{256, rep_prefix_1_byte, true},
2432 {256, loop, false},
2433 {-1, libcall, false}}}};
/* Cost table for alderlake tuning.  The initializer is positional: every
   value is identified only by its trailing comment, so entries must stay in
   order.  Memory-move costs are stated relative to a reg-reg move of cost 2.
   Integer load/store costs appear twice (register-allocator section and the
   later section); the store costs differ between the two here.  */
2434 static const
2435 struct processor_costs alderlake_cost = {
2437 /* Start of register allocator costs. integer->integer move cost is 2. */
2438 6, /* cost for loading QImode using movzbl */
2439 {6, 6, 6}, /* cost of loading integer registers
2440 in QImode, HImode and SImode.
2441 Relative to reg-reg move (2). */
2442 {6, 6, 6}, /* cost of storing integer registers */
2443 4, /* cost of reg,reg fld/fst */
2444 {6, 6, 12}, /* cost of loading fp registers
2445 in SFmode, DFmode and XFmode */
2446 {6, 6, 12}, /* cost of storing fp registers
2447 in SFmode, DFmode and XFmode */
2448 2, /* cost of moving MMX register */
2449 {6, 6}, /* cost of loading MMX registers
2450 in SImode and DImode */
2451 {6, 6}, /* cost of storing MMX registers
2452 in SImode and DImode */
2453 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2454 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2455 in 32,64,128,256 and 512-bit */
2456 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2457 in 32,64,128,256 and 512-bit */
2458 6, 6, /* SSE->integer and integer->SSE moves */
2459 6, 6, /* mask->integer and integer->mask moves */
2460 {6, 6, 6}, /* cost of loading mask register
2461 in QImode, HImode, SImode. */
2462 {6, 6, 6}, /* cost of storing mask register
2463 in QImode, HImode, SImode. */
2464 2, /* cost of moving mask register. */
2465 /* End of register allocator costs. */
2468 COSTS_N_INSNS (1), /* cost of an add instruction */
2469 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2470 COSTS_N_INSNS (1), /* variable shift costs */
2471 COSTS_N_INSNS (1), /* constant shift costs */
2472 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2473 COSTS_N_INSNS (3), /* HI */
2474 COSTS_N_INSNS (3), /* SI */
2475 COSTS_N_INSNS (3), /* DI */
2476 COSTS_N_INSNS (4)}, /* other */
2477 0, /* cost of multiply per each bit set */
2478 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2479 COSTS_N_INSNS (22), /* HI */
2480 COSTS_N_INSNS (30), /* SI */
2481 COSTS_N_INSNS (74), /* DI */
2482 COSTS_N_INSNS (74)}, /* other */
2483 COSTS_N_INSNS (1), /* cost of movsx */
2484 COSTS_N_INSNS (1), /* cost of movzx */
2485 8, /* "large" insn */
2486 17, /* MOVE_RATIO */
2487 17, /* CLEAR_RATIO */
2488 {6, 6, 6}, /* cost of loading integer registers
2489 in QImode, HImode and SImode.
2490 Relative to reg-reg move (2). */
2491 {8, 8, 8}, /* cost of storing integer registers */
2492 {8, 8, 8, 10, 15}, /* cost of loading SSE register
2493 in 32bit, 64bit, 128bit, 256bit and 512bit */
2494 {8, 8, 8, 10, 15}, /* cost of storing SSE register
2495 in 32bit, 64bit, 128bit, 256bit and 512bit */
2496 {8, 8, 8, 10, 15}, /* cost of unaligned loads. */
2497 {8, 8, 8, 10, 15}, /* cost of unaligned stores. */
2498 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2499 6, /* cost of moving SSE register to integer. */
2500 18, 6, /* Gather load static, per_elt. */
2501 18, 6, /* Gather store static, per_elt. */
2502 32, /* size of l1 cache. */
2503 512, /* size of l2 cache. */
2504 64, /* size of prefetch block */
2505 6, /* number of parallel prefetches */
2506 3, /* Branch cost */
2507 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2508 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2509 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2510 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2511 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2512 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2514 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2515 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2516 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2517 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2518 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2519 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2520 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2521 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2522 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2523 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2524 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2525 alderlake_memcpy,
2526 alderlake_memset,
2527 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2528 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2529 "16:11:8", /* Loop alignment. */
2530 "16:11:8", /* Jump alignment. */
2531 "0:0:8", /* Label alignment. */
2532 "16", /* Func alignment. */
2533 4, /* Small unroll limit. */
2534 2, /* Small unroll factor. */
2535 COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
2538 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
2539 very small blocks it is better to use a loop. For large blocks, libcall can
2540 use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by btver1_cost.
   NOTE(review): the first entry uses rep_prefix_4_byte and the second
   rep_prefix_8_byte, which suggests 32-bit and 64-bit variants; the leading
   number of each triple is presumably an upper size bound with -1 meaning
   "no bound" -- confirm against the stringop_algs declaration.  */
2541 static stringop_algs btver1_memcpy[2] = {
2542 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2543 {-1, rep_prefix_4_byte, false}}},
2544 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2545 {-1, libcall, false}}}};
/* memset strategy table referenced by btver1_cost.
   NOTE(review): the first entry uses rep_prefix_4_byte and the second
   rep_prefix_8_byte, which suggests 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration.  */
2546 static stringop_algs btver1_memset[2] = {
2547 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2548 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2549 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2550 {-1, libcall, false}}}};
/* Cost table for BTVER1 tuning.  The initializer is positional: every value
   is identified only by its trailing comment, so entries must stay in order.
   Memory-move costs are stated relative to a reg-reg move of cost 2.  Unlike
   most neighbouring cost tables this one is declared without `static'.  */
2551 const struct processor_costs btver1_cost = {
2553 /* Start of register allocator costs. integer->integer move cost is 2. */
2554 8, /* cost for loading QImode using movzbl */
2555 {6, 8, 6}, /* cost of loading integer registers
2556 in QImode, HImode and SImode.
2557 Relative to reg-reg move (2). */
2558 {6, 8, 6}, /* cost of storing integer registers */
2559 4, /* cost of reg,reg fld/fst */
2560 {12, 12, 28}, /* cost of loading fp registers
2561 in SFmode, DFmode and XFmode */
2562 {12, 12, 38}, /* cost of storing fp registers
2563 in SFmode, DFmode and XFmode */
2564 4, /* cost of moving MMX register */
2565 {10, 10}, /* cost of loading MMX registers
2566 in SImode and DImode */
2567 {12, 12}, /* cost of storing MMX registers
2568 in SImode and DImode */
2569 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2570 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2571 in 32,64,128,256 and 512-bit */
2572 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2573 in 32,64,128,256 and 512-bit */
2574 14, 14, /* SSE->integer and integer->SSE moves */
2575 14, 14, /* mask->integer and integer->mask moves */
2576 {6, 8, 6}, /* cost of loading mask register
2577 in QImode, HImode, SImode. */
2578 {6, 8, 6}, /* cost of storing mask register
2579 in QImode, HImode, SImode. */
2580 2, /* cost of moving mask register. */
2581 /* End of register allocator costs. */
2584 COSTS_N_INSNS (1), /* cost of an add instruction */
2585 COSTS_N_INSNS (2), /* cost of a lea instruction */
2586 COSTS_N_INSNS (1), /* variable shift costs */
2587 COSTS_N_INSNS (1), /* constant shift costs */
2588 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2589 COSTS_N_INSNS (4), /* HI */
2590 COSTS_N_INSNS (3), /* SI */
2591 COSTS_N_INSNS (4), /* DI */
2592 COSTS_N_INSNS (5)}, /* other */
2593 0, /* cost of multiply per each bit set */
2594 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2595 COSTS_N_INSNS (35), /* HI */
2596 COSTS_N_INSNS (51), /* SI */
2597 COSTS_N_INSNS (83), /* DI */
2598 COSTS_N_INSNS (83)}, /* other */
2599 COSTS_N_INSNS (1), /* cost of movsx */
2600 COSTS_N_INSNS (1), /* cost of movzx */
2601 8, /* "large" insn */
2602 9, /* MOVE_RATIO */
2603 6, /* CLEAR_RATIO */
2604 {6, 8, 6}, /* cost of loading integer registers
2605 in QImode, HImode and SImode.
2606 Relative to reg-reg move (2). */
2607 {6, 8, 6}, /* cost of storing integer registers */
2608 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2609 in 32bit, 64bit, 128bit, 256bit and 512bit */
2610 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2611 in 32bit, 64bit, 128bit, 256bit and 512bit */
2612 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2613 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2614 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2615 14, /* cost of moving SSE register to integer. */
2616 10, 10, /* Gather load static, per_elt. */
2617 10, 10, /* Gather store static, per_elt. */
2618 32, /* size of l1 cache. */
2619 512, /* size of l2 cache. */
2620 64, /* size of prefetch block */
2621 100, /* number of parallel prefetches */
2622 2, /* Branch cost */
2623 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2624 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2625 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2626 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2627 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2628 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2630 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2631 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2632 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2633 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2634 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2635 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2636 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2637 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2638 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2639 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
2640 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2641 btver1_memcpy,
2642 btver1_memset,
2643 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2644 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2645 "16:11:8", /* Loop alignment. */
2646 "16:8:8", /* Jump alignment. */
2647 "0:0:8", /* Label alignment. */
2648 "11", /* Func alignment. */
2649 4, /* Small unroll limit. */
2650 2, /* Small unroll factor. */
2651 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* memcpy strategy table referenced by btver2_cost.  The contents are the same
   as btver1_memcpy.
   NOTE(review): the two entries presumably select 32-bit and 64-bit code --
   confirm against the stringop_algs declaration.  */
2654 static stringop_algs btver2_memcpy[2] = {
2655 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2656 {-1, rep_prefix_4_byte, false}}},
2657 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2658 {-1, libcall, false}}}};
/* memset strategy table referenced by btver2_cost.  The contents are the same
   as btver1_memset.
   NOTE(review): the two entries presumably select 32-bit and 64-bit code --
   confirm against the stringop_algs declaration.  */
2659 static stringop_algs btver2_memset[2] = {
2660 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2661 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2662 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2663 {-1, libcall, false}}}};
/* Cost table for BTVER2 tuning.  The initializer is positional: every value
   is identified only by its trailing comment, so entries must stay in order.
   Memory-move costs are stated relative to a reg-reg move of cost 2.  Unlike
   most neighbouring cost tables this one is declared without `static'.  */
2664 const struct processor_costs btver2_cost = {
2666 /* Start of register allocator costs. integer->integer move cost is 2. */
2667 8, /* cost for loading QImode using movzbl */
2668 {8, 8, 6}, /* cost of loading integer registers
2669 in QImode, HImode and SImode.
2670 Relative to reg-reg move (2). */
2671 {8, 8, 6}, /* cost of storing integer registers */
2672 4, /* cost of reg,reg fld/fst */
2673 {12, 12, 28}, /* cost of loading fp registers
2674 in SFmode, DFmode and XFmode */
2675 {12, 12, 38}, /* cost of storing fp registers
2676 in SFmode, DFmode and XFmode */
2677 4, /* cost of moving MMX register */
2678 {10, 10}, /* cost of loading MMX registers
2679 in SImode and DImode */
2680 {12, 12}, /* cost of storing MMX registers
2681 in SImode and DImode */
2682 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2683 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2684 in 32,64,128,256 and 512-bit */
2685 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2686 in 32,64,128,256 and 512-bit */
2687 14, 14, /* SSE->integer and integer->SSE moves */
2688 14, 14, /* mask->integer and integer->mask moves */
2689 {8, 8, 6}, /* cost of loading mask register
2690 in QImode, HImode, SImode. */
2691 {8, 8, 6}, /* cost of storing mask register
2692 in QImode, HImode, SImode. */
2693 2, /* cost of moving mask register. */
2694 /* End of register allocator costs. */
2697 COSTS_N_INSNS (1), /* cost of an add instruction */
2698 COSTS_N_INSNS (2), /* cost of a lea instruction */
2699 COSTS_N_INSNS (1), /* variable shift costs */
2700 COSTS_N_INSNS (1), /* constant shift costs */
2701 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2702 COSTS_N_INSNS (4), /* HI */
2703 COSTS_N_INSNS (3), /* SI */
2704 COSTS_N_INSNS (4), /* DI */
2705 COSTS_N_INSNS (5)}, /* other */
2706 0, /* cost of multiply per each bit set */
2707 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2708 COSTS_N_INSNS (35), /* HI */
2709 COSTS_N_INSNS (51), /* SI */
2710 COSTS_N_INSNS (83), /* DI */
2711 COSTS_N_INSNS (83)}, /* other */
2712 COSTS_N_INSNS (1), /* cost of movsx */
2713 COSTS_N_INSNS (1), /* cost of movzx */
2714 8, /* "large" insn */
2715 9, /* MOVE_RATIO */
2716 6, /* CLEAR_RATIO */
2717 {8, 8, 6}, /* cost of loading integer registers
2718 in QImode, HImode and SImode.
2719 Relative to reg-reg move (2). */
2720 {8, 8, 6}, /* cost of storing integer registers */
2721 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2722 in 32bit, 64bit, 128bit, 256bit and 512bit */
2723 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2724 in 32bit, 64bit, 128bit, 256bit and 512bit */
2725 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2726 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2727 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2728 14, /* cost of moving SSE register to integer. */
2729 10, 10, /* Gather load static, per_elt. */
2730 10, 10, /* Gather store static, per_elt. */
2731 32, /* size of l1 cache. */
2732 2048, /* size of l2 cache. */
2733 64, /* size of prefetch block */
2734 100, /* number of parallel prefetches */
2735 2, /* Branch cost */
2736 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2737 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2738 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2739 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2740 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2741 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2743 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2744 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2745 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2746 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2747 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2748 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2749 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2750 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
2751 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
2752 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
2753 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2754 btver2_memcpy,
2755 btver2_memset,
2756 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2757 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2758 "16:11:8", /* Loop alignment. */
2759 "16:8:8", /* Jump alignment. */
2760 "0:0:8", /* Label alignment. */
2761 "11", /* Func alignment. */
2762 4, /* Small unroll limit. */
2763 2, /* Small unroll factor. */
2764 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* memcpy strategy table referenced by pentium4_cost.  The second entry is the
   DUMMY_STRINGOP_ALGS placeholder, which expands to an always-libcall entry.
   NOTE(review): the placeholder presumably stands in for a 64-bit variant that
   does not apply to this CPU -- confirm.  */
2767 static stringop_algs pentium4_memcpy[2] = {
2768 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2769 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium4_cost.  The second entry is the
   DUMMY_STRINGOP_ALGS placeholder, which expands to an always-libcall entry.
   NOTE(review): the placeholder presumably stands in for a 64-bit variant that
   does not apply to this CPU -- confirm.  */
2770 static stringop_algs pentium4_memset[2] = {
2771 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2772 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2773 DUMMY_STRINGOP_ALGS};
/* Cost table for Pentium 4 tuning.  The initializer is positional: every
   value is identified only by its trailing comment, so entries must stay in
   order.  Memory-move costs are stated relative to a reg-reg move of cost 2.
   All four alignment strings are NULL in this table.  */
2775 static const
2776 struct processor_costs pentium4_cost = {
2778 /* Start of register allocator costs. integer->integer move cost is 2. */
2779 5, /* cost for loading QImode using movzbl */
2780 {4, 5, 4}, /* cost of loading integer registers
2781 in QImode, HImode and SImode.
2782 Relative to reg-reg move (2). */
2783 {2, 3, 2}, /* cost of storing integer registers */
2784 12, /* cost of reg,reg fld/fst */
2785 {14, 14, 14}, /* cost of loading fp registers
2786 in SFmode, DFmode and XFmode */
2787 {14, 14, 14}, /* cost of storing fp registers
2788 in SFmode, DFmode and XFmode */
2789 12, /* cost of moving MMX register */
2790 {16, 16}, /* cost of loading MMX registers
2791 in SImode and DImode */
2792 {16, 16}, /* cost of storing MMX registers
2793 in SImode and DImode */
2794 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2795 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
2796 in 32,64,128,256 and 512-bit */
2797 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
2798 in 32,64,128,256 and 512-bit */
2799 20, 12, /* SSE->integer and integer->SSE moves */
2800 20, 12, /* mask->integer and integer->mask moves */
2801 {4, 5, 4}, /* cost of loading mask register
2802 in QImode, HImode, SImode. */
2803 {2, 3, 2}, /* cost of storing mask register
2804 in QImode, HImode, SImode. */
2805 2, /* cost of moving mask register. */
2806 /* End of register allocator costs. */
2809 COSTS_N_INSNS (1), /* cost of an add instruction */
2810 COSTS_N_INSNS (3), /* cost of a lea instruction */
2811 COSTS_N_INSNS (4), /* variable shift costs */
2812 COSTS_N_INSNS (4), /* constant shift costs */
2813 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
2814 COSTS_N_INSNS (15), /* HI */
2815 COSTS_N_INSNS (15), /* SI */
2816 COSTS_N_INSNS (15), /* DI */
2817 COSTS_N_INSNS (15)}, /* other */
2818 0, /* cost of multiply per each bit set */
2819 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
2820 COSTS_N_INSNS (56), /* HI */
2821 COSTS_N_INSNS (56), /* SI */
2822 COSTS_N_INSNS (56), /* DI */
2823 COSTS_N_INSNS (56)}, /* other */
2824 COSTS_N_INSNS (1), /* cost of movsx */
2825 COSTS_N_INSNS (1), /* cost of movzx */
2826 16, /* "large" insn */
2827 6, /* MOVE_RATIO */
2828 6, /* CLEAR_RATIO */
2829 {4, 5, 4}, /* cost of loading integer registers
2830 in QImode, HImode and SImode.
2831 Relative to reg-reg move (2). */
2832 {2, 3, 2}, /* cost of storing integer registers */
2833 {16, 16, 16, 32, 64}, /* cost of loading SSE register
2834 in 32bit, 64bit, 128bit, 256bit and 512bit */
2835 {16, 16, 16, 32, 64}, /* cost of storing SSE register
2836 in 32bit, 64bit, 128bit, 256bit and 512bit */
2837 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
2838 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
2839 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2840 20, /* cost of moving SSE register to integer. */
2841 16, 16, /* Gather load static, per_elt. */
2842 16, 16, /* Gather store static, per_elt. */
2843 8, /* size of l1 cache. */
2844 256, /* size of l2 cache. */
2845 64, /* size of prefetch block */
2846 6, /* number of parallel prefetches */
2847 2, /* Branch cost */
2848 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
2849 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2850 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
2851 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2852 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2853 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
2855 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2856 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2857 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
2858 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
2859 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2860 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2861 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
2862 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2863 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2864 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
2865 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2866 pentium4_memcpy,
2867 pentium4_memset,
2868 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2869 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2870 NULL, /* Loop alignment. */
2871 NULL, /* Jump alignment. */
2872 NULL, /* Label alignment. */
2873 NULL, /* Func alignment. */
2874 4, /* Small unroll limit. */
2875 2, /* Small unroll factor. */
2876 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* memcpy strategy table referenced by nocona_cost.  The first entry matches
   the first entry of pentium4_memcpy; the second is specific to this table.
   NOTE(review): the first entry uses rep_prefix_4_byte and the second
   rep_prefix_8_byte, which suggests 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration.  */
2879 static stringop_algs nocona_memcpy[2] = {
2880 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2881 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2882 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by nocona_cost.  The first entry matches
   the first entry of pentium4_memset; the second is specific to this table.
   NOTE(review): the first entry uses rep_prefix_4_byte and the second
   rep_prefix_8_byte, which suggests 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration.  */
2884 static stringop_algs nocona_memset[2] = {
2885 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2886 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2887 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2888 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for Nocona tuning.  The initializer is positional: every value
   is identified only by its trailing comment, so entries must stay in order.
   Memory-move costs are stated relative to a reg-reg move of cost 2.  All four
   alignment strings are NULL in this table.  */
2890 static const
2891 struct processor_costs nocona_cost = {
2893 /* Start of register allocator costs. integer->integer move cost is 2. */
2894 4, /* cost for loading QImode using movzbl */
2895 {4, 4, 4}, /* cost of loading integer registers
2896 in QImode, HImode and SImode.
2897 Relative to reg-reg move (2). */
2898 {4, 4, 4}, /* cost of storing integer registers */
2899 12, /* cost of reg,reg fld/fst */
2900 {14, 14, 14}, /* cost of loading fp registers
2901 in SFmode, DFmode and XFmode */
2902 {14, 14, 14}, /* cost of storing fp registers
2903 in SFmode, DFmode and XFmode */
2904 14, /* cost of moving MMX register */
2905 {12, 12}, /* cost of loading MMX registers
2906 in SImode and DImode */
2907 {12, 12}, /* cost of storing MMX registers
2908 in SImode and DImode */
2909 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2910 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2911 in 32,64,128,256 and 512-bit */
2912 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2913 in 32,64,128,256 and 512-bit */
2914 20, 12, /* SSE->integer and integer->SSE moves */
2915 20, 12, /* mask->integer and integer->mask moves */
2916 {4, 4, 4}, /* cost of loading mask register
2917 in QImode, HImode, SImode. */
2918 {4, 4, 4}, /* cost of storing mask register
2919 in QImode, HImode, SImode. */
2920 2, /* cost of moving mask register. */
2921 /* End of register allocator costs. */
2924 COSTS_N_INSNS (1), /* cost of an add instruction */
2925 COSTS_N_INSNS (1), /* cost of a lea instruction */
2926 COSTS_N_INSNS (1), /* variable shift costs */
2927 COSTS_N_INSNS (1), /* constant shift costs */
2928 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2929 COSTS_N_INSNS (10), /* HI */
2930 COSTS_N_INSNS (10), /* SI */
2931 COSTS_N_INSNS (10), /* DI */
2932 COSTS_N_INSNS (10)}, /* other */
2933 0, /* cost of multiply per each bit set */
2934 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2935 COSTS_N_INSNS (66), /* HI */
2936 COSTS_N_INSNS (66), /* SI */
2937 COSTS_N_INSNS (66), /* DI */
2938 COSTS_N_INSNS (66)}, /* other */
2939 COSTS_N_INSNS (1), /* cost of movsx */
2940 COSTS_N_INSNS (1), /* cost of movzx */
2941 16, /* "large" insn */
2942 17, /* MOVE_RATIO */
2943 6, /* CLEAR_RATIO */
2944 {4, 4, 4}, /* cost of loading integer registers
2945 in QImode, HImode and SImode.
2946 Relative to reg-reg move (2). */
2947 {4, 4, 4}, /* cost of storing integer registers */
2948 {12, 12, 12, 24, 48}, /* cost of loading SSE register
2949 in 32bit, 64bit, 128bit, 256bit and 512bit */
2950 {12, 12, 12, 24, 48}, /* cost of storing SSE register
2951 in 32bit, 64bit, 128bit, 256bit and 512bit */
2952 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2953 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2954 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2955 20, /* cost of moving SSE register to integer. */
2956 12, 12, /* Gather load static, per_elt. */
2957 12, 12, /* Gather store static, per_elt. */
2958 8, /* size of l1 cache. */
2959 1024, /* size of l2 cache. */
2960 64, /* size of prefetch block */
2961 8, /* number of parallel prefetches */
2962 1, /* Branch cost */
2963 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2964 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2965 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2966 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2967 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2968 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2970 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2971 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2972 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2973 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2974 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2975 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2976 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2977 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2978 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2979 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2980 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2981 nocona_memcpy,
2982 nocona_memset,
2983 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2984 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2985 NULL, /* Loop alignment. */
2986 NULL, /* Jump alignment. */
2987 NULL, /* Label alignment. */
2988 NULL, /* Func alignment. */
2989 4, /* Small unroll limit. */
2990 2, /* Small unroll factor. */
2991 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* Two-entry stringop_algs table for memcpy; atom_cost refers to it by name.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
2994 static stringop_algs atom_memcpy[2] = {
2995 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2996 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2997 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Two-entry stringop_algs table for memset; atom_cost refers to it by name.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
2998 static stringop_algs atom_memset[2] = {
2999 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3000 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3001 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3002 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Positional initializer for struct processor_costs.  The trailing comment
   on each entry names the field it sets.  The table opens with the register
   allocator costs and ends with the branch mispredict scale; the string
   operation strategies are taken from atom_memcpy and atom_memset.  */
3003 static const
3004 struct processor_costs atom_cost = {
3006 /* Start of register allocator costs. integer->integer move cost is 2. */
3007 6, /* cost for loading QImode using movzbl */
3008 {6, 6, 6}, /* cost of loading integer registers
3009 in QImode, HImode and SImode.
3010 Relative to reg-reg move (2). */
3011 {6, 6, 6}, /* cost of storing integer registers */
3012 4, /* cost of reg,reg fld/fst */
3013 {6, 6, 18}, /* cost of loading fp registers
3014 in SFmode, DFmode and XFmode */
3015 {14, 14, 24}, /* cost of storing fp registers
3016 in SFmode, DFmode and XFmode */
3017 2, /* cost of moving MMX register */
3018 {8, 8}, /* cost of loading MMX registers
3019 in SImode and DImode */
3020 {10, 10}, /* cost of storing MMX registers
3021 in SImode and DImode */
3022 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3023 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
3024 in 32,64,128,256 and 512-bit */
3025 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
3026 in 32,64,128,256 and 512-bit */
3027 8, 6, /* SSE->integer and integer->SSE moves */
3028 8, 6, /* mask->integer and integer->mask moves */
3029 {6, 6, 6}, /* cost of loading mask register
3030 in QImode, HImode, SImode. */
3031 {6, 6, 6}, /* cost of storing mask register
3032 in QImode, HImode, SImode. */
3033 2, /* cost of moving mask register. */
3034 /* End of register allocator costs. */
3037 COSTS_N_INSNS (1), /* cost of an add instruction */
3038 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3039 COSTS_N_INSNS (1), /* variable shift costs */
3040 COSTS_N_INSNS (1), /* constant shift costs */
3041 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3042 COSTS_N_INSNS (4), /* HI */
3043 COSTS_N_INSNS (3), /* SI */
3044 COSTS_N_INSNS (4), /* DI */
3045 COSTS_N_INSNS (2)}, /* other */
3046 0, /* cost of multiply per each bit set */
3047 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3048 COSTS_N_INSNS (26), /* HI */
3049 COSTS_N_INSNS (42), /* SI */
3050 COSTS_N_INSNS (74), /* DI */
3051 COSTS_N_INSNS (74)}, /* other */
3052 COSTS_N_INSNS (1), /* cost of movsx */
3053 COSTS_N_INSNS (1), /* cost of movzx */
3054 8, /* "large" insn */
3055 17, /* MOVE_RATIO */
3056 6, /* CLEAR_RATIO */
3057 {6, 6, 6}, /* cost of loading integer registers
3058 in QImode, HImode and SImode.
3059 Relative to reg-reg move (2). */
3060 {6, 6, 6}, /* cost of storing integer registers */
3061 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3062 in 32bit, 64bit, 128bit, 256bit and 512bit */
3063 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3064 in 32bit, 64bit, 128bit, 256bit and 512bit */
3065 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
3066 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
3067 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3068 8, /* cost of moving SSE register to integer. */
3069 8, 8, /* Gather load static, per_elt. */
3070 8, 8, /* Gather store static, per_elt. */
3071 32, /* size of l1 cache. */
3072 256, /* size of l2 cache. */
3073 64, /* size of prefetch block */
3074 6, /* number of parallel prefetches */
3075 3, /* Branch cost */
3076 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3077 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3078 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3079 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3080 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3081 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3083 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3084 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
3085 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3086 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3087 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3088 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3089 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
3090 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
3091 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
3092 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
3093 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
3094 atom_memcpy,
3095 atom_memset,
3096 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3097 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3098 "16", /* Loop alignment. */
3099 "16:8:8", /* Jump alignment. */
3100 "0:0:8", /* Label alignment. */
3101 "16", /* Func alignment. */
3102 4, /* Small unroll limit. */
3103 2, /* Small unroll factor. */
3104 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* Two-entry stringop_algs table for memcpy; slm_cost refers to it by name.
   The values are the same as those in atom_memcpy.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3107 static stringop_algs slm_memcpy[2] = {
3108 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3109 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3110 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Two-entry stringop_algs table for memset; slm_cost refers to it by name.
   The values are the same as those in atom_memset.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3111 static stringop_algs slm_memset[2] = {
3112 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3113 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3114 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3115 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Positional initializer for struct processor_costs.  The trailing comment
   on each entry names the field it sets.  The table opens with the register
   allocator costs and ends with the branch mispredict scale; the string
   operation strategies are taken from slm_memcpy and slm_memset.  */
3116 static const
3117 struct processor_costs slm_cost = {
3119 /* Start of register allocator costs. integer->integer move cost is 2. */
3120 8, /* cost for loading QImode using movzbl */
3121 {8, 8, 8}, /* cost of loading integer registers
3122 in QImode, HImode and SImode.
3123 Relative to reg-reg move (2). */
3124 {6, 6, 6}, /* cost of storing integer registers */
3125 2, /* cost of reg,reg fld/fst */
3126 {8, 8, 18}, /* cost of loading fp registers
3127 in SFmode, DFmode and XFmode */
3128 {6, 6, 18}, /* cost of storing fp registers
3129 in SFmode, DFmode and XFmode */
3130 2, /* cost of moving MMX register */
3131 {8, 8}, /* cost of loading MMX registers
3132 in SImode and DImode */
3133 {6, 6}, /* cost of storing MMX registers
3134 in SImode and DImode */
3135 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3136 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
3137 in 32,64,128,256 and 512-bit */
3138 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
3139 in 32,64,128,256 and 512-bit */
3140 8, 6, /* SSE->integer and integer->SSE moves */
3141 8, 6, /* mask->integer and integer->mask moves */
3142 {8, 8, 8}, /* cost of loading mask register
3143 in QImode, HImode, SImode. */
3144 {6, 6, 6}, /* cost of storing mask register
3145 in QImode, HImode, SImode. */
3146 2, /* cost of moving mask register. */
3147 /* End of register allocator costs. */
3150 COSTS_N_INSNS (1), /* cost of an add instruction */
3151 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3152 COSTS_N_INSNS (1), /* variable shift costs */
3153 COSTS_N_INSNS (1), /* constant shift costs */
3154 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3155 COSTS_N_INSNS (3), /* HI */
3156 COSTS_N_INSNS (3), /* SI */
3157 COSTS_N_INSNS (4), /* DI */
3158 COSTS_N_INSNS (2)}, /* other */
3159 0, /* cost of multiply per each bit set */
3160 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3161 COSTS_N_INSNS (26), /* HI */
3162 COSTS_N_INSNS (42), /* SI */
3163 COSTS_N_INSNS (74), /* DI */
3164 COSTS_N_INSNS (74)}, /* other */
3165 COSTS_N_INSNS (1), /* cost of movsx */
3166 COSTS_N_INSNS (1), /* cost of movzx */
3167 8, /* "large" insn */
3168 17, /* MOVE_RATIO */
3169 6, /* CLEAR_RATIO */
3170 {8, 8, 8}, /* cost of loading integer registers
3171 in QImode, HImode and SImode.
3172 Relative to reg-reg move (2). */
3173 {6, 6, 6}, /* cost of storing integer registers */
3174 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3175 in 32bit, 64bit, 128bit, 256bit and 512bit */
3176 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3177 in 32bit, 64bit, 128bit, 256bit and 512bit */
3178 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
3179 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
3180 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3181 8, /* cost of moving SSE register to integer. */
3182 8, 8, /* Gather load static, per_elt. */
3183 8, 8, /* Gather store static, per_elt. */
3184 32, /* size of l1 cache. */
3185 256, /* size of l2 cache. */
3186 64, /* size of prefetch block */
3187 6, /* number of parallel prefetches */
3188 3, /* Branch cost */
3189 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3190 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3191 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3192 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3193 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3194 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3196 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3197 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3198 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3199 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3200 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3201 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3202 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
3203 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
3204 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3205 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3206 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3207 slm_memcpy,
3208 slm_memset,
3209 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3210 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3211 "16", /* Loop alignment. */
3212 "16:8:8", /* Jump alignment. */
3213 "0:0:8", /* Label alignment. */
3214 "16", /* Func alignment. */
3215 4, /* Small unroll limit. */
3216 2, /* Small unroll factor. */
3217 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* Two-entry stringop_algs table for memcpy; tremont_cost refers to it by
   name.  Both entries hold the same values.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3220 static stringop_algs tremont_memcpy[2] = {
3221 {libcall,
3222 {{256, rep_prefix_1_byte, true},
3223 {256, loop, false},
3224 {-1, libcall, false}}},
3225 {libcall,
3226 {{256, rep_prefix_1_byte, true},
3227 {256, loop, false},
3228 {-1, libcall, false}}}};
/* Two-entry stringop_algs table for memset; tremont_cost refers to it by
   name.  Both entries hold the same values, which also match
   tremont_memcpy.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3229 static stringop_algs tremont_memset[2] = {
3230 {libcall,
3231 {{256, rep_prefix_1_byte, true},
3232 {256, loop, false},
3233 {-1, libcall, false}}},
3234 {libcall,
3235 {{256, rep_prefix_1_byte, true},
3236 {256, loop, false},
3237 {-1, libcall, false}}}};
/* Positional initializer for struct processor_costs.  The trailing comment
   on each entry names the field it sets.  The table opens with the register
   allocator costs and ends with the branch mispredict scale; the string
   operation strategies are taken from tremont_memcpy and tremont_memset.  */
3238 static const
3239 struct processor_costs tremont_cost = {
3241 /* Start of register allocator costs. integer->integer move cost is 2. */
3242 6, /* cost for loading QImode using movzbl */
3243 {6, 6, 6}, /* cost of loading integer registers
3244 in QImode, HImode and SImode.
3245 Relative to reg-reg move (2). */
3246 {6, 6, 6}, /* cost of storing integer registers */
3247 4, /* cost of reg,reg fld/fst */
3248 {6, 6, 12}, /* cost of loading fp registers
3249 in SFmode, DFmode and XFmode */
3250 {6, 6, 12}, /* cost of storing fp registers
3251 in SFmode, DFmode and XFmode */
3252 2, /* cost of moving MMX register */
3253 {6, 6}, /* cost of loading MMX registers
3254 in SImode and DImode */
3255 {6, 6}, /* cost of storing MMX registers
3256 in SImode and DImode */
3257 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3258 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3259 in 32,64,128,256 and 512-bit */
3260 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3261 in 32,64,128,256 and 512-bit */
3262 6, 6, /* SSE->integer and integer->SSE moves */
3263 6, 6, /* mask->integer and integer->mask moves */
3264 {6, 6, 6}, /* cost of loading mask register
3265 in QImode, HImode, SImode. */
3266 {6, 6, 6}, /* cost of storing mask register
3267 in QImode, HImode, SImode. */
3268 2, /* cost of moving mask register. */
3269 /* End of register allocator costs. */
3272 COSTS_N_INSNS (1), /* cost of an add instruction */
3273 /* Setting cost to 2 makes our current implementation of synth_mult result in
3274 use of unnecessary temporary registers causing regression on several
3275 SPECfp benchmarks. */
3276 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3277 COSTS_N_INSNS (1), /* variable shift costs */
3278 COSTS_N_INSNS (1), /* constant shift costs */
3279 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3280 COSTS_N_INSNS (3), /* HI */
3281 COSTS_N_INSNS (3), /* SI */
3282 COSTS_N_INSNS (3), /* DI */
3283 COSTS_N_INSNS (4)}, /* other */
3284 0, /* cost of multiply per each bit set */
3285 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3286 COSTS_N_INSNS (22), /* HI */
3287 COSTS_N_INSNS (30), /* SI */
3288 COSTS_N_INSNS (74), /* DI */
3289 COSTS_N_INSNS (74)}, /* other */
3290 COSTS_N_INSNS (1), /* cost of movsx */
3291 COSTS_N_INSNS (1), /* cost of movzx */
3292 8, /* "large" insn */
3293 17, /* MOVE_RATIO */
3294 17, /* CLEAR_RATIO */
3295 {6, 6, 6}, /* cost of loading integer registers
3296 in QImode, HImode and SImode.
3297 Relative to reg-reg move (2). */
3298 {6, 6, 6}, /* cost of storing integer registers */
3299 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3300 in 32bit, 64bit, 128bit, 256bit and 512bit */
3301 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3302 in 32bit, 64bit, 128bit, 256bit and 512bit */
3303 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3304 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3305 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3306 6, /* cost of moving SSE register to integer. */
3307 18, 6, /* Gather load static, per_elt. */
3308 18, 6, /* Gather store static, per_elt. */
3309 32, /* size of l1 cache. */
3310 512, /* size of l2 cache. */
3311 64, /* size of prefetch block */
3312 6, /* number of parallel prefetches */
3313 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
3314 value is increased to the perhaps more appropriate value of 5. */
/* NOTE(review): the remark above names K8 although this table is
   tremont_cost; it may have been carried over from another table --
   confirm it still applies.  */
3315 3, /* Branch cost */
3316 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3317 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3318 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3319 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3320 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3321 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3323 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3324 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3325 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3326 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3327 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3328 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3329 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3330 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3331 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3332 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3333 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3334 tremont_memcpy,
3335 tremont_memset,
3336 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3337 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3338 "16:11:8", /* Loop alignment. */
3339 "16:11:8", /* Jump alignment. */
3340 "0:0:8", /* Label alignment. */
3341 "16", /* Func alignment. */
3342 4, /* Small unroll limit. */
3343 2, /* Small unroll factor. */
3344 COSTS_N_INSNS (2), /* Branch mispredict scale. */
/* Two-entry stringop_algs table for memcpy; intel_cost refers to it by name.
   The values are the same as those in atom_memcpy and slm_memcpy.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3347 static stringop_algs intel_memcpy[2] = {
3348 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3349 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3350 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Two-entry stringop_algs table for memset; intel_cost refers to it by name.
   The values are the same as those in atom_memset and slm_memset.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3351 static stringop_algs intel_memset[2] = {
3352 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3353 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3354 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3355 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Positional initializer for struct processor_costs.  The trailing comment
   on each entry names the field it sets.  The table opens with the register
   allocator costs and ends with the branch mispredict scale; the string
   operation strategies are taken from intel_memcpy and intel_memset.  */
3356 static const
3357 struct processor_costs intel_cost = {
3359 /* Start of register allocator costs. integer->integer move cost is 2. */
3360 6, /* cost for loading QImode using movzbl */
3361 {4, 4, 4}, /* cost of loading integer registers
3362 in QImode, HImode and SImode.
3363 Relative to reg-reg move (2). */
3364 {6, 6, 6}, /* cost of storing integer registers */
3365 2, /* cost of reg,reg fld/fst */
3366 {6, 6, 8}, /* cost of loading fp registers
3367 in SFmode, DFmode and XFmode */
3368 {6, 6, 10}, /* cost of storing fp registers
3369 in SFmode, DFmode and XFmode */
3370 2, /* cost of moving MMX register */
3371 {6, 6}, /* cost of loading MMX registers
3372 in SImode and DImode */
3373 {6, 6}, /* cost of storing MMX registers
3374 in SImode and DImode */
3375 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3376 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
3377 in 32,64,128,256 and 512-bit */
3378 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
3379 in 32,64,128,256 and 512-bit */
3380 4, 4, /* SSE->integer and integer->SSE moves */
3381 4, 4, /* mask->integer and integer->mask moves */
3382 {4, 4, 4}, /* cost of loading mask register
3383 in QImode, HImode, SImode. */
3384 {6, 6, 6}, /* cost of storing mask register
3385 in QImode, HImode, SImode. */
3386 2, /* cost of moving mask register. */
3387 /* End of register allocator costs. */
3390 COSTS_N_INSNS (1), /* cost of an add instruction */
3391 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3392 COSTS_N_INSNS (1), /* variable shift costs */
3393 COSTS_N_INSNS (1), /* constant shift costs */
3394 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3395 COSTS_N_INSNS (3), /* HI */
3396 COSTS_N_INSNS (3), /* SI */
3397 COSTS_N_INSNS (4), /* DI */
3398 COSTS_N_INSNS (2)}, /* other */
3399 0, /* cost of multiply per each bit set */
3400 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3401 COSTS_N_INSNS (26), /* HI */
3402 COSTS_N_INSNS (42), /* SI */
3403 COSTS_N_INSNS (74), /* DI */
3404 COSTS_N_INSNS (74)}, /* other */
3405 COSTS_N_INSNS (1), /* cost of movsx */
3406 COSTS_N_INSNS (1), /* cost of movzx */
3407 8, /* "large" insn */
3408 17, /* MOVE_RATIO */
3409 6, /* CLEAR_RATIO */
3410 {4, 4, 4}, /* cost of loading integer registers
3411 in QImode, HImode and SImode.
3412 Relative to reg-reg move (2). */
3413 {6, 6, 6}, /* cost of storing integer registers */
3414 {6, 6, 6, 6, 6}, /* cost of loading SSE register
3415 in 32bit, 64bit, 128bit, 256bit and 512bit */
3416 {6, 6, 6, 6, 6}, /* cost of storing SSE register
3417 in 32bit, 64bit, 128bit, 256bit and 512bit */
3418 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
3419 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
3420 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3421 4, /* cost of moving SSE register to integer. */
3422 6, 6, /* Gather load static, per_elt. */
3423 6, 6, /* Gather store static, per_elt. */
3424 32, /* size of l1 cache. */
3425 256, /* size of l2 cache. */
3426 64, /* size of prefetch block */
3427 6, /* number of parallel prefetches */
3428 3, /* Branch cost */
3429 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3430 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3431 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3432 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3433 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3434 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3436 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3437 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
3438 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
3439 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
3440 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3441 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3442 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
3443 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
3444 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
3445 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
3446 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3447 intel_memcpy,
3448 intel_memset,
3449 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3450 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3451 "16", /* Loop alignment. */
3452 "16:8:8", /* Jump alignment. */
3453 "0:0:8", /* Label alignment. */
3454 "16", /* Func alignment. */
3455 4, /* Small unroll limit. */
3456 2, /* Small unroll factor. */
3457 COSTS_N_INSNS (2), /* Branch mispredict scale. */
3460 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
/* Two-entry stringop_algs table for memcpy; lujiazui_cost refers to it by
   name.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3461 static stringop_algs lujiazui_memcpy[2] = {
3462 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3463 {-1, libcall, false}}},
3464 {libcall, {{12, unrolled_loop, true}, {32, loop, false},
3465 {6144, rep_prefix_8_byte, false},
3466 {-1, libcall, false}}}};
/* Two-entry stringop_algs table for memset; lujiazui_cost refers to it by
   name.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3467 static stringop_algs lujiazui_memset[2] = {
3468 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3469 {-1, libcall, false}}},
3470 {libcall, {{12, loop, true}, {32, loop, false},
3471 {640, rep_prefix_8_byte, false},
3472 {-1, libcall, false}}}};
/* Positional initializer for struct processor_costs.  The trailing comment
   on each entry names the field it sets.  The table opens with the register
   allocator costs and ends with the branch mispredict scale; the string
   operation strategies are taken from lujiazui_memcpy and lujiazui_memset.  */
3473 static const
3474 struct processor_costs lujiazui_cost = {
3476 /* Start of register allocator costs. integer->integer move cost is 2. */
3477 6, /* cost for loading QImode using movzbl. */
3478 {6, 6, 6}, /* cost of loading integer registers
3479 in QImode, HImode and SImode.
3480 Relative to reg-reg move (2). */
3481 {6, 6, 6}, /* cost of storing integer registers. */
3482 2, /* cost of reg,reg fld/fst. */
3483 {6, 6, 8}, /* cost of loading fp registers
3484 in SFmode, DFmode and XFmode. */
3485 {6, 6, 8}, /* cost of storing fp registers
3486 in SFmode, DFmode and XFmode. */
3487 2, /* cost of moving MMX register. */
3488 {6, 6}, /* cost of loading MMX registers
3489 in SImode and DImode. */
3490 {6, 6}, /* cost of storing MMX registers
3491 in SImode and DImode. */
3492 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3493 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3494 in 32,64,128,256 and 512-bit. */
3495 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3496 in 32,64,128,256 and 512-bit. */
3497 6, 6, /* SSE->integer and integer->SSE moves. */
3498 6, 6, /* mask->integer and integer->mask moves. */
3499 {6, 6, 6}, /* cost of loading mask register
3500 in QImode, HImode, SImode. */
3501 {6, 6, 6}, /* cost of storing mask register
3502 in QImode, HImode, SImode. */
3503 2, /* cost of moving mask register. */
3504 /* End of register allocator costs. */
3507 COSTS_N_INSNS (1), /* cost of an add instruction. */
3508 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction. */
3509 COSTS_N_INSNS (1), /* variable shift costs. */
3510 COSTS_N_INSNS (1), /* constant shift costs. */
3511 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3512 COSTS_N_INSNS (3), /* HI. */
3513 COSTS_N_INSNS (3), /* SI. */
3514 COSTS_N_INSNS (12), /* DI. */
3515 COSTS_N_INSNS (14)}, /* other. */
3516 0, /* cost of multiply per each bit set. */
3517 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI. */
3518 COSTS_N_INSNS (24), /* HI. */
3519 COSTS_N_INSNS (24), /* SI. */
3520 COSTS_N_INSNS (150), /* DI. */
3521 COSTS_N_INSNS (152)}, /* other. */
3522 COSTS_N_INSNS (1), /* cost of movsx. */
3523 COSTS_N_INSNS (1), /* cost of movzx. */
3524 8, /* "large" insn. */
3525 17, /* MOVE_RATIO. */
3526 6, /* CLEAR_RATIO. */
3527 {6, 6, 6}, /* cost of loading integer registers
3528 in QImode, HImode and SImode.
3529 Relative to reg-reg move (2). */
3530 {6, 6, 6}, /* cost of storing integer registers. */
3531 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3532 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3533 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3534 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3535 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3536 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3537 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3538 6, /* cost of moving SSE register to integer. */
3539 18, 6, /* Gather load static, per_elt. */
3540 18, 6, /* Gather store static, per_elt. */
3541 32, /* size of l1 cache. */
3542 4096, /* size of l2 cache. */
3543 64, /* size of prefetch block. */
3544 /* Lujiazui processor never drops prefetches, like AMD processors. */
3545 100, /* number of parallel prefetches. */
3546 3, /* Branch cost. */
3547 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3548 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
3549 COSTS_N_INSNS (22), /* cost of FDIV instruction. */
3550 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3551 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3552 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3554 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3555 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3556 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3557 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
3558 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3559 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3560 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3561 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3562 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
3563 COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */
3564 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3565 lujiazui_memcpy,
3566 lujiazui_memset,
3567 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3568 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3569 "16:11:8", /* Loop alignment. */
3570 "16:11:8", /* Jump alignment. */
3571 "0:0:8", /* Label alignment. */
3572 "16", /* Func alignment. */
3573 4, /* Small unroll limit. */
3574 2, /* Small unroll factor. */
3575 COSTS_N_INSNS (2), /* Branch mispredict scale. */
3578 /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
/* Two-entry stringop_algs table for memcpy; yongfeng_cost refers to it by
   name.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3579 static stringop_algs yongfeng_memcpy[2] = {
3580 {libcall, {{6, unrolled_loop, true}, {256, unrolled_loop, false},
3581 {-1, libcall, false}}},
3582 {libcall, {{8, loop, false}, {512, unrolled_loop, false},
3583 {-1, libcall, false}}}};
/* Two-entry stringop_algs table for memset; yongfeng_cost refers to it by
   name.
   NOTE(review): the order of the two entries and the meaning of each
   {size, algorithm, flag} triple come from the stringop_algs definition,
   which is not visible here -- confirm there.  */
3584 static stringop_algs yongfeng_memset[2] = {
3585 {libcall, {{6, loop_1_byte, false}, {128, loop, false},
3586 {-1, libcall, false}}},
3587 {libcall, {{2, rep_prefix_4_byte, false}, {64, loop, false},
3588 {1024, vector_loop, false},
3589 {-1, libcall, false}}}};
/* Positional initializer for struct processor_costs.  The trailing comment
   on each entry names the field it sets.  The table opens with the register
   allocator costs and ends with the branch mispredict scale; the string
   operation strategies are taken from yongfeng_memcpy and yongfeng_memset.  */
3590 static const
3591 struct processor_costs yongfeng_cost = {
3593 /* Start of register allocator costs. integer->integer move cost is 2. */
3594 8, /* cost for loading QImode using movzbl. */
3595 {8, 8, 8}, /* cost of loading integer registers
3596 in QImode, HImode and SImode.
3597 Relative to reg-reg move (2). */
3598 {8, 8, 8}, /* cost of storing integer registers. */
3599 2, /* cost of reg,reg fld/fst. */
3600 {8, 8, 8}, /* cost of loading fp registers
3601 in SFmode, DFmode and XFmode. */
3602 {8, 8, 8}, /* cost of storing fp registers
3603 in SFmode, DFmode and XFmode. */
3604 2, /* cost of moving MMX register. */
3605 {8, 8}, /* cost of loading MMX registers
3606 in SImode and DImode. */
3607 {8, 8}, /* cost of storing MMX registers
3608 in SImode and DImode. */
3609 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3610 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3611 in 32,64,128,256 and 512-bit. */
3612 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3613 in 32,64,128,256 and 512-bit. */
3614 8, 8, /* SSE->integer and integer->SSE moves. */
3615 8, 8, /* mask->integer and integer->mask moves. */
3616 {8, 8, 8}, /* cost of loading mask register
3617 in QImode, HImode, SImode. */
3618 {8, 8, 8}, /* cost of storing mask register
3619 in QImode, HImode, SImode. */
3620 2, /* cost of moving mask register. */
3621 /* End of register allocator costs. */
3624 COSTS_N_INSNS (1), /* cost of an add instruction. */
3625 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3626 COSTS_N_INSNS (1), /* variable shift costs. */
3627 COSTS_N_INSNS (1), /* constant shift costs. */
3628 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3629 COSTS_N_INSNS (3), /* HI. */
3630 COSTS_N_INSNS (2), /* SI. */
3631 COSTS_N_INSNS (2), /* DI. */
3632 COSTS_N_INSNS (3)}, /* other. */
3633 0, /* cost of multiply per each bit set. */
3634 {COSTS_N_INSNS (8), /* cost of a divide/mod for QI. */
3635 COSTS_N_INSNS (9), /* HI. */
3636 COSTS_N_INSNS (8), /* SI. */
3637 COSTS_N_INSNS (41), /* DI. */
3638 COSTS_N_INSNS (41)}, /* other. */
3639 COSTS_N_INSNS (1), /* cost of movsx. */
3640 COSTS_N_INSNS (1), /* cost of movzx. */
3641 8, /* "large" insn. */
3642 17, /* MOVE_RATIO. */
3643 6, /* CLEAR_RATIO. */
3644 {8, 8, 8}, /* cost of loading integer registers
3645 in QImode, HImode and SImode.
3646 Relative to reg-reg move (2). */
3647 {8, 8, 8}, /* cost of storing integer registers. */
/* NOTE(review): the 256-bit SSE load/store cost below is 12, while the
   register allocator section of this table uses 10 for the same width --
   confirm the difference is intended.  */
3648 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3649 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3650 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3651 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3652 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3653 {8, 8, 8, 12, 15}, /* cost of unaligned stores. */
3654 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3655 8, /* cost of moving SSE register to integer. */
3656 18, 6, /* Gather load static, per_elt. */
3657 18, 6, /* Gather store static, per_elt. */
3658 32, /* size of l1 cache. */
3659 256, /* size of l2 cache. */
3660 64, /* size of prefetch block. */
3661 12, /* number of parallel prefetches. */
3662 3, /* Branch cost. */
3663 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3664 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3665 COSTS_N_INSNS (14), /* cost of FDIV instruction. */
3666 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3667 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3668 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3670 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3671 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3672 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3673 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3674 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3675 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3676 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
3677 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3678 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3679 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3680 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3681 yongfeng_memcpy,
3682 yongfeng_memset,
3683 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3684 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3685 "16:11:8", /* Loop alignment. */
3686 "16:11:8", /* Jump alignment. */
3687 "0:0:8", /* Label alignment. */
3688 "16", /* Func alignment. */
3689 4, /* Small unroll limit. */
3690 2, /* Small unroll factor. */
3691 COSTS_N_INSNS (2), /* Branch mispredict scale. */
3694 /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
/* memcpy expansion strategy table referenced from shijidadao_cost.
   Each of the two entries names a default algorithm (libcall) followed by
   a list of {size, algorithm, flag} triples that ends with a -1 entry.
   NOTE(review): entry [0] is presumably the 32-bit strategy and [1] the
   64-bit one, and the triple presumably reads {max size, algorithm,
   noalign} -- confirm against the stringop_algs declaration.  */
3695 static stringop_algs shijidadao_memcpy[2] = {
3696 {libcall, {{8, unrolled_loop, true}, {256, unrolled_loop, false},
3697 {-1, libcall, false}}},
3698 {libcall, {{10, loop, true}, {256, unrolled_loop, false},
3699 {-1, libcall, false}}}};
/* memset expansion strategy table referenced from shijidadao_cost.
   Same layout as the memcpy table: a default algorithm followed by
   {size, algorithm, flag} triples ending with a -1 entry.
   NOTE(review): entry [0] is presumably the 32-bit strategy and [1] the
   64-bit one -- confirm against the stringop_algs declaration.  */
3700 static stringop_algs shijidadao_memset[2] = {
3701 {libcall, {{4, loop, true}, {128, unrolled_loop, false},
3702 {-1, libcall, false}}},
3703 {libcall, {{1, rep_prefix_4_byte, false}, {14, loop, true},
3704 {1024, vector_loop, false},
3705 {-1, libcall, false}}}};
/* Cost table for the shijidadao tuning.  The initializer is positional:
   register allocator move costs come first, then per-instruction costs
   written with COSTS_N_INSNS, then move/clear ratios, load/store costs,
   cache parameters, x87 and SSE arithmetic costs, the string-operation
   strategy tables, branch costs and alignment strings.  Entries must stay
   in this exact order because no designated initializers are used.  */
3706 static const
3707 struct processor_costs shijidadao_cost = {
3709 /* Start of register allocator costs. integer->integer move cost is 2. */
3710 8, /* cost for loading QImode using movzbl. */
3711 {8, 8, 8}, /* cost of loading integer registers
3712 in QImode, HImode and SImode.
3713 Relative to reg-reg move (2). */
3714 {8, 8, 8}, /* cost of storing integer registers. */
3715 2, /* cost of reg,reg fld/fst. */
3716 {8, 8, 8}, /* cost of loading fp registers
3717 in SFmode, DFmode and XFmode. */
3718 {8, 8, 8}, /* cost of storing fp registers
3719 in SFmode, DFmode and XFmode. */
3720 2, /* cost of moving MMX register. */
3721 {8, 8}, /* cost of loading MMX registers
3722 in SImode and DImode. */
3723 {8, 8}, /* cost of storing MMX registers
3724 in SImode and DImode. */
3725 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3726 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3727 in 32,64,128,256 and 512-bit. */
3728 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3729 in 32,64,128,256 and 512-bit. */
3730 8, 8, /* SSE->integer and integer->SSE moves. */
3731 8, 8, /* mask->integer and integer->mask moves. */
3732 {8, 8, 8}, /* cost of loading mask register
3733 in QImode, HImode, SImode. */
3734 {8, 8, 8}, /* cost of storing mask register
3735 in QImode, HImode, SImode. */
3736 2, /* cost of moving mask register. */
3737 /* End of register allocator costs. */
3740 COSTS_N_INSNS (1), /* cost of an add instruction. */
3741 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3742 COSTS_N_INSNS (1), /* variable shift costs. */
3743 COSTS_N_INSNS (1), /* constant shift costs. */
3744 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3745 COSTS_N_INSNS (3), /* HI. */
3746 COSTS_N_INSNS (2), /* SI. */
3747 COSTS_N_INSNS (2), /* DI. */
3748 COSTS_N_INSNS (3)}, /* other. */
3749 0, /* cost of multiply per each bit set. */
3750 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
3751 COSTS_N_INSNS (10), /* HI. */
3752 COSTS_N_INSNS (9), /* SI. */
3753 COSTS_N_INSNS (50), /* DI. */
3754 COSTS_N_INSNS (50)}, /* other. */
3755 COSTS_N_INSNS (1), /* cost of movsx. */
3756 COSTS_N_INSNS (1), /* cost of movzx. */
3757 8, /* "large" insn. */
3758 17, /* MOVE_RATIO. */
3759 6, /* CLEAR_RATIO. */
3760 {8, 8, 8}, /* cost of loading integer registers
3761 in QImode, HImode and SImode.
3762 Relative to reg-reg move (2). */
3763 {8, 8, 8}, /* cost of storing integer registers. */
3764 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3765 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3766 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3767 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3768 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3769 {8, 8, 8, 12, 15}, /* cost of unaligned stores. */
3770 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3771 8, /* cost of moving SSE register to integer. */
3772 18, 6, /* Gather load static, per_elt. */
3773 18, 6, /* Gather store static, per_elt. */
3774 32, /* size of l1 cache. */
3775 256, /* size of l2 cache. */
3776 64, /* size of prefetch block. */
3777 12, /* number of parallel prefetches. */
3778 3, /* Branch cost. */
3779 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3780 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3781 COSTS_N_INSNS (13), /* cost of FDIV instruction. */
3782 COSTS_N_INSNS (2), /* cost of FABS instruction. */
3783 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
3784 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3786 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3787 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3788 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3789 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3790 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3791 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3792 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
3793 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3794 COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */
3795 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3796 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3797 shijidadao_memcpy,
3798 shijidadao_memset,
3799 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3800 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3801 "16:11:8", /* Loop alignment. */
3802 "16:11:8", /* Jump alignment. */
3803 "0:0:8", /* Label alignment. */
3804 "16", /* Func alignment. */
3805 4, /* Small unroll limit. */
3806 2, /* Small unroll factor. */
3807 COSTS_N_INSNS (2), /* Branch mispredict scale. */
3812 /* Generic should produce code tuned for Core-i7 (and newer chips)
3813 and btver1 (and newer chips). */
/* memcpy expansion strategy table referenced from generic_cost.
   Each of the two entries names a default algorithm (libcall) followed by
   {size, algorithm, flag} triples ending with a -1 entry; the two entries
   differ only in using rep_prefix_4_byte versus rep_prefix_8_byte.
   NOTE(review): entry [0] is presumably the 32-bit strategy and [1] the
   64-bit one -- confirm against the stringop_algs declaration.  */
3815 static stringop_algs generic_memcpy[2] = {
3816 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3817 {-1, libcall, false}}},
3818 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3819 {-1, libcall, false}}}};
/* memset expansion strategy table referenced from generic_cost.
   The values are identical to the generic memcpy table: loop up to 32,
   a rep-prefixed form up to 8192, and libcall for the -1 entry.
   NOTE(review): entry [0] is presumably the 32-bit strategy and [1] the
   64-bit one -- confirm against the stringop_algs declaration.  */
3820 static stringop_algs generic_memset[2] = {
3821 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3822 {-1, libcall, false}}},
3823 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3824 {-1, libcall, false}}}};
/* Cost table for the generic tuning.  The initializer is positional:
   register allocator move costs come first, then per-instruction costs
   written with COSTS_N_INSNS, then move/clear ratios, load/store costs,
   cache parameters, x87 and SSE arithmetic costs, the string-operation
   strategy tables, branch costs and alignment strings.  Entries must stay
   in this exact order because no designated initializers are used.  */
3825 static const
3826 struct processor_costs generic_cost = {
3828 /* Start of register allocator costs. integer->integer move cost is 2. */
3829 6, /* cost for loading QImode using movzbl */
3830 {6, 6, 6}, /* cost of loading integer registers
3831 in QImode, HImode and SImode.
3832 Relative to reg-reg move (2). */
3833 {6, 6, 6}, /* cost of storing integer registers */
3834 4, /* cost of reg,reg fld/fst */
3835 {6, 6, 12}, /* cost of loading fp registers
3836 in SFmode, DFmode and XFmode */
3837 {6, 6, 12}, /* cost of storing fp registers
3838 in SFmode, DFmode and XFmode */
3839 2, /* cost of moving MMX register */
3840 {6, 6}, /* cost of loading MMX registers
3841 in SImode and DImode */
3842 {6, 6}, /* cost of storing MMX registers
3843 in SImode and DImode */
3844 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3845 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3846 in 32,64,128,256 and 512-bit */
3847 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3848 in 32,64,128,256 and 512-bit */
3849 6, 6, /* SSE->integer and integer->SSE moves */
3850 6, 6, /* mask->integer and integer->mask moves */
3851 {6, 6, 6}, /* cost of loading mask register
3852 in QImode, HImode, SImode. */
3853 {6, 6, 6}, /* cost of storing mask register
3854 in QImode, HImode, SImode. */
3855 2, /* cost of moving mask register. */
3856 /* End of register allocator costs. */
3859 COSTS_N_INSNS (1), /* cost of an add instruction */
3860 /* Setting cost to 2 makes our current implementation of synth_mult result in
3861 use of unnecessary temporary registers causing regression on several
3862 SPECfp benchmarks. */
3863 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3864 COSTS_N_INSNS (1), /* variable shift costs */
3865 COSTS_N_INSNS (1), /* constant shift costs */
3866 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3867 COSTS_N_INSNS (3), /* HI */
3868 COSTS_N_INSNS (3), /* SI */
3869 COSTS_N_INSNS (3), /* DI */
3870 COSTS_N_INSNS (4)}, /* other */
3871 0, /* cost of multiply per each bit set */
3872 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3873 COSTS_N_INSNS (22), /* HI */
3874 COSTS_N_INSNS (30), /* SI */
3875 COSTS_N_INSNS (74), /* DI */
3876 COSTS_N_INSNS (74)}, /* other */
3877 COSTS_N_INSNS (1), /* cost of movsx */
3878 COSTS_N_INSNS (1), /* cost of movzx */
3879 8, /* "large" insn */
3880 17, /* MOVE_RATIO */
3881 6, /* CLEAR_RATIO */
3882 {6, 6, 6}, /* cost of loading integer registers
3883 in QImode, HImode and SImode.
3884 Relative to reg-reg move (2). */
3885 {6, 6, 6}, /* cost of storing integer registers */
3886 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3887 in 32bit, 64bit, 128bit, 256bit and 512bit */
3888 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3889 in 32bit, 64bit, 128bit, 256bit and 512bit */
3890 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3891 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3892 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3893 6, /* cost of moving SSE register to integer. */
3894 18, 6, /* Gather load static, per_elt. */
3895 18, 6, /* Gather store static, per_elt. */
3896 32, /* size of l1 cache. */
3897 512, /* size of l2 cache. */
3898 64, /* size of prefetch block */
3899 6, /* number of parallel prefetches */
3900 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3901 value is increased to the perhaps more appropriate value of 5. */
3902 3, /* Branch cost */
3903 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3904 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3905 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3906 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3907 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3908 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3910 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3911 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3912 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3913 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3914 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3915 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3916 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3917 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3918 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3919 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3920 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3921 generic_memcpy,
3922 generic_memset,
3923 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3924 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3925 "16", /* Loop alignment. */
3926 "16:11:8", /* Jump alignment. */
3927 "0:0:8", /* Label alignment. */
3928 "16", /* Func alignment. */
3929 4, /* Small unroll limit. */
3930 2, /* Small unroll factor. */
3931 COSTS_N_INSNS (2), /* Branch mispredict scale. */
3934 /* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion strategy table referenced from core_cost.
   Each of the two entries names a default algorithm (libcall) followed by
   {size, algorithm, flag} triples ending with a -1 entry.
   NOTE(review): entry [0] is presumably the 32-bit strategy and [1] the
   64-bit one, and the triple presumably reads {max size, algorithm,
   noalign} -- confirm against the stringop_algs declaration.  */
3935 static stringop_algs core_memcpy[2] = {
3936 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
3937 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
3938 {-1, libcall, false}}}};
/* memset expansion strategy table referenced from core_cost.
   Same layout as the memcpy table: a default algorithm followed by
   {size, algorithm, flag} triples ending with a -1 entry.
   NOTE(review): entry [0] is presumably the 32-bit strategy and [1] the
   64-bit one -- confirm against the stringop_algs declaration.  */
3939 static stringop_algs core_memset[2] = {
3940 {libcall, {{6, loop_1_byte, true},
3941 {24, loop, true},
3942 {8192, rep_prefix_4_byte, true},
3943 {-1, libcall, false}}},
3944 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
3945 {-1, libcall, false}}}};
3947 static const
3948 struct processor_costs core_cost = {
3950 /* Start of register allocator costs. integer->integer move cost is 2. */
3951 6, /* cost for loading QImode using movzbl */
3952 {4, 4, 4}, /* cost of loading integer registers
3953 in QImode, HImode and SImode.
3954 Relative to reg-reg move (2). */
3955 {6, 6, 6}, /* cost of storing integer registers */
3956 2, /* cost of reg,reg fld/fst */
3957 {6, 6, 8}, /* cost of loading fp registers
3958 in SFmode, DFmode and XFmode */
3959 {6, 6, 10}, /* cost of storing fp registers
3960 in SFmode, DFmode and XFmode */
3961 2, /* cost of moving MMX register */
3962 {6, 6}, /* cost of loading MMX registers
3963 in SImode and DImode */
3964 {6, 6}, /* cost of storing MMX registers
3965 in SImode and DImode */
3966 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
3967 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
3968 in 32,64,128,256 and 512-bit */
3969 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
3970 in 32,64,128,256 and 512-bit */
3971 6, 6, /* SSE->integer and integer->SSE moves */
3972 6, 6, /* mask->integer and integer->mask moves */
3973 {4, 4, 4}, /* cost of loading mask register
3974 in QImode, HImode, SImode. */
3975 {6, 6, 6}, /* cost if storing mask register
3976 in QImode, HImode, SImode. */
3977 2, /* cost of moving mask register. */
3978 /* End of register allocator costs. */
3981 COSTS_N_INSNS (1), /* cost of an add instruction */
3982 /* On all chips taken into consideration lea is 2 cycles and more. With
3983 this cost however our current implementation of synth_mult results in
3984 use of unnecessary temporary registers causing regression on several
3985 SPECfp benchmarks. */
3986 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3987 COSTS_N_INSNS (1), /* variable shift costs */
3988 COSTS_N_INSNS (1), /* constant shift costs */
3989 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3990 COSTS_N_INSNS (4), /* HI */
3991 COSTS_N_INSNS (3), /* SI */
3992 /* Here we tune for Sandybridge or newer. */
3993 COSTS_N_INSNS (3), /* DI */
3994 COSTS_N_INSNS (3)}, /* other */
3995 0, /* cost of multiply per each bit set */
3996 /* Expanding div/mod currently doesn't consider parallelism. So the cost
3997 model is not realistic. We compensate by increasing the latencies a bit. */
3998 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
3999 COSTS_N_INSNS (11), /* HI */
4000 COSTS_N_INSNS (14), /* SI */
4001 COSTS_N_INSNS (81), /* DI */
4002 COSTS_N_INSNS (81)}, /* other */
4003 COSTS_N_INSNS (1), /* cost of movsx */
4004 COSTS_N_INSNS (1), /* cost of movzx */
4005 8, /* "large" insn */
4006 17, /* MOVE_RATIO */
4007 6, /* CLEAR_RATIO */
4008 {4, 4, 4}, /* cost of loading integer registers
4009 in QImode, HImode and SImode.
4010 Relative to reg-reg move (2). */
4011 {6, 6, 6}, /* cost of storing integer registers */
4012 {6, 6, 6, 6, 12}, /* cost of loading SSE register
4013 in 32bit, 64bit, 128bit, 256bit and 512bit */
4014 {6, 6, 6, 6, 12}, /* cost of storing SSE register
4015 in 32bit, 64bit, 128bit, 256bit and 512bit */
4016 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
4017 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
4018 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
4019 2, /* cost of moving SSE register to integer. */
4020 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
4021 rec. throughput 6.
4022 So 5 uops statically and one uops per load. */
4023 10, 6, /* Gather load static, per_elt. */
4024 10, 6, /* Gather store static, per_elt. */
4025 64, /* size of l1 cache. */
4026 512, /* size of l2 cache. */
4027 64, /* size of prefetch block */
4028 6, /* number of parallel prefetches */
4029 /* FIXME perhaps more appropriate value is 5. */
4030 3, /* Branch cost */
4031 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
4032 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
4033 /* 10-24 */
4034 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
4035 COSTS_N_INSNS (1), /* cost of FABS instruction. */
4036 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
4037 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
4039 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
4040 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
4041 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
4042 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
4043 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
4044 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
4045 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
4046 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
4047 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
4048 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
4049 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
4050 core_memcpy,
4051 core_memset,
4052 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
4053 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
4054 "16:11:8", /* Loop alignment. */
4055 "16:11:8", /* Jump alignment. */
4056 "0:0:8", /* Label alignment. */
4057 "16", /* Func alignment. */
4058 4, /* Small unroll limit. */
4059 2, /* Small unroll factor. */
4060 COSTS_N_INSNS (2), /* Branch mispredict scale. */