gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2025 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes, so under that assumption COSTS_N_BYTES (2) has the same value as COSTS_N_INSNS (1).  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)
  27
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} /* libcall-only placeholder; fills the second slot of the i386/i486/pentium stringop tables in this file.  */
  29
  30 static stringop_algs ix86_size_memcpy[2] = { /* memcpy stringop_algs referenced by ix86_size_cost.  */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; /* both slots are identical.  */
  33 static stringop_algs ix86_size_memset[2] = { /* memset stringop_algs referenced by ix86_size_cost.  */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; /* both slots are identical.  */
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size; most entries use COSTS_N_BYTES rather than COSTS_N_INSNS */
  39   {
  40   /* Start of register allocator costs.  integer->integer move cost is 2. */
  41   2,                                 /* cost for loading QImode using movzbl */
  42   {2, 2, 2},                            /* cost of loading integer registers
  43                                            in QImode, HImode and SImode.
  44                                            Relative to reg-reg move (2).  */
  45   {2, 2, 2},                            /* cost of storing integer registers */
  46   2,                                    /* cost of reg,reg fld/fst */
  47   {2, 2, 2},                            /* cost of loading fp registers
  48                                            in SFmode, DFmode and XFmode */
  49   {2, 2, 2},                            /* cost of storing fp registers
  50                                            in SFmode, DFmode and XFmode */
  51   3,                                    /* cost of moving MMX register */
  52   {3, 3},                               /* cost of loading MMX registers
  53                                            in SImode and DImode */
  54   {3, 3},                               /* cost of storing MMX registers
  55                                            in SImode and DImode */
  56   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  57   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  58                                            in 32,64,128,256 and 512-bit */
  59   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  60                                            in 32,64,128,256 and 512-bit */
  61   3, 3,                         /* SSE->integer and integer->SSE moves */
  62   3, 3,                         /* mask->integer and integer->mask moves */
  63   {2, 2, 2},                            /* cost of loading mask register
  64                                            in QImode, HImode, SImode.  */
  65   {2, 2, 2},                            /* cost of storing mask register
  66                                            in QImode, HImode, SImode.  */
  67   2,                                    /* cost of moving mask register.  */
  68   /* End of register allocator costs.  */
  69   },
  70
  71   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  72   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  73   COSTS_N_BYTES (2),                    /* variable shift costs */
  74   COSTS_N_BYTES (3),                    /* constant shift costs */
  75   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  76    COSTS_N_BYTES (3),                   /*                               HI */
  77    COSTS_N_BYTES (3),                   /*                               SI */
  78    COSTS_N_BYTES (3),                   /*                               DI */
  79    COSTS_N_BYTES (5)},                  /*                            other */
  80   0,                                    /* cost of multiply per each bit set */
  81   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  82    COSTS_N_BYTES (3),                   /*                          HI */
  83    COSTS_N_BYTES (3),                   /*                          SI */
  84    COSTS_N_BYTES (3),                   /*                          DI */
  85    COSTS_N_BYTES (5)},                  /*                          other */
  86   COSTS_N_BYTES (3),                    /* cost of movsx */
  87   COSTS_N_BYTES (3),                    /* cost of movzx */
  88   0,                                    /* "large" insn */
  89   2,                                    /* MOVE_RATIO */
  90   2,                                    /* CLEAR_RATIO */
  91   {2, 2, 2},                            /* cost of loading integer registers
  92                                            in QImode, HImode and SImode.
  93                                            Relative to reg-reg move (2).  */
  94   {2, 2, 2},                            /* cost of storing integer registers */
  95   {3, 3, 3, 3, 3},                      /* cost of loading SSE register
  96                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  97   {3, 3, 3, 3, 3},                      /* cost of storing SSE register
  98                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  99   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load
 100                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 101   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE store
 102                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 103   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
 104   3,                                    /* cost of moving SSE register to integer.  */
 105   5, 0,                                 /* Gather load static, per_elt.  */
 106   5, 0,                                 /* Gather store static, per_elt.  */
 107   0,                                    /* size of l1 cache  */
 108   0,                                    /* size of l2 cache  */
 109   0,                                    /* size of prefetch block */
 110   0,                                    /* number of parallel prefetches */
 111   2,                                    /* Branch cost */
 112   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
 113   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
 114   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
 115   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
 116   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
 117   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
 118
 119   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 120   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 121   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 122   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 123   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 124   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 125   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 126   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 127   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 128   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 129   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 130   ix86_size_memcpy,                     /* memcpy stringop_algs table.  */
 131   ix86_size_memset,                     /* memset stringop_algs table.  */
 132   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 133   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 134   NULL,                                 /* Loop alignment.  */
 135   NULL,                                 /* Jump alignment.  */
 136   NULL,                                 /* Label alignment.  */
 137   NULL,                                 /* Func alignment.  */
 138   4,                                    /* Small unroll limit.  */
 139   2,                                    /* Small unroll factor.  */
 140   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 141 };
 142
 143 /* Processor costs (relative to an add) */
 144 static stringop_algs i386_memcpy[2] = { /* memcpy stringop_algs referenced by i386_cost.  */
 145   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 146   DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder.  */
 147 static stringop_algs i386_memset[2] = { /* memset stringop_algs referenced by i386_cost.  */
 148   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 149   DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder.  */
 150
 151 static const
 152 struct processor_costs i386_cost = {    /* 386 specific costs */
 153   {
 154   /* Start of register allocator costs.  integer->integer move cost is 2. */
 155   4,                                 /* cost for loading QImode using movzbl */
 156   {2, 4, 2},                            /* cost of loading integer registers
 157                                            in QImode, HImode and SImode.
 158                                            Relative to reg-reg move (2).  */
 159   {2, 4, 2},                            /* cost of storing integer registers */
 160   2,                                    /* cost of reg,reg fld/fst */
 161   {8, 8, 8},                            /* cost of loading fp registers
 162                                            in SFmode, DFmode and XFmode */
 163   {8, 8, 8},                            /* cost of storing fp registers
 164                                            in SFmode, DFmode and XFmode */
 165   2,                                    /* cost of moving MMX register */
 166   {4, 8},                               /* cost of loading MMX registers
 167                                            in SImode and DImode */
 168   {4, 8},                               /* cost of storing MMX registers
 169                                            in SImode and DImode */
 170   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 171   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 172                                            in 32,64,128,256 and 512-bit */
 173   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 174                                            in 32,64,128,256 and 512-bit */
 175   3, 3,                         /* SSE->integer and integer->SSE moves */
 176   3, 3,                         /* mask->integer and integer->mask moves */
 177   {2, 4, 2},                            /* cost of loading mask register
 178                                            in QImode, HImode, SImode.  */
 179   {2, 4, 2},                            /* cost of storing mask register
 180                                            in QImode, HImode, SImode.  */
 181   2,                                    /* cost of moving mask register.  */
 182   /* End of register allocator costs.  */
 183   },
 184
 185   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 186   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 187   COSTS_N_INSNS (3),                    /* variable shift costs */
 188   COSTS_N_INSNS (2),                    /* constant shift costs */
 189   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 190    COSTS_N_INSNS (6),                   /*                               HI */
 191    COSTS_N_INSNS (6),                   /*                               SI */
 192    COSTS_N_INSNS (6),                   /*                               DI */
 193    COSTS_N_INSNS (6)},                  /*                            other */
 194   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 195   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 196    COSTS_N_INSNS (23),                  /*                          HI */
 197    COSTS_N_INSNS (23),                  /*                          SI */
 198    COSTS_N_INSNS (23),                  /*                          DI */
 199    COSTS_N_INSNS (23)},                 /*                          other */
 200   COSTS_N_INSNS (3),                    /* cost of movsx */
 201   COSTS_N_INSNS (2),                    /* cost of movzx */
 202   15,                                   /* "large" insn */
 203   3,                                    /* MOVE_RATIO */
 204   3,                                    /* CLEAR_RATIO */
 205   {2, 4, 2},                            /* cost of loading integer registers
 206                                            in QImode, HImode and SImode.
 207                                            Relative to reg-reg move (2).  */
 208   {2, 4, 2},                            /* cost of storing integer registers */
 209   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 210                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 211   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 212                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 213   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 214   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 215   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 216   3,                                    /* cost of moving SSE register to integer.  */
 217   4, 4,                                 /* Gather load static, per_elt.  */
 218   4, 4,                                 /* Gather store static, per_elt.  */
 219   0,                                    /* size of l1 cache  */
 220   0,                                    /* size of l2 cache  */
 221   0,                                    /* size of prefetch block */
 222   0,                                    /* number of parallel prefetches */
 223   1,                                    /* Branch cost */
 224   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 225   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 226   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 227   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 228   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 229   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 230
 231   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 232   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 233   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 234   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 235   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 236   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 237   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 238   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 239   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 240   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 241   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 242   i386_memcpy,                          /* memcpy stringop_algs table.  */
 243   i386_memset,                          /* memset stringop_algs table.  */
 244   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 245   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 246   "4",                                  /* Loop alignment.  */
 247   "4",                                  /* Jump alignment.  */
 248   NULL,                                 /* Label alignment.  */
 249   "4",                                  /* Func alignment.  */
 250   4,                                    /* Small unroll limit.  */
 251   2,                                    /* Small unroll factor.  */
 252   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 253 };
 254
 255 static stringop_algs i486_memcpy[2] = { /* memcpy stringop_algs referenced by i486_cost.  */
 256   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 257   DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder.  */
 258 static stringop_algs i486_memset[2] = { /* memset stringop_algs referenced by i486_cost.  */
 259   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 260   DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder.  */
 261
 262 static const
 263 struct processor_costs i486_cost = {    /* 486 specific costs */
 264   {
 265   /* Start of register allocator costs.  integer->integer move cost is 2. */
 266   4,                                 /* cost for loading QImode using movzbl */
 267   {2, 4, 2},                            /* cost of loading integer registers
 268                                            in QImode, HImode and SImode.
 269                                            Relative to reg-reg move (2).  */
 270   {2, 4, 2},                            /* cost of storing integer registers */
 271   2,                                    /* cost of reg,reg fld/fst */
 272   {8, 8, 8},                            /* cost of loading fp registers
 273                                            in SFmode, DFmode and XFmode */
 274   {8, 8, 8},                            /* cost of storing fp registers
 275                                            in SFmode, DFmode and XFmode */
 276   2,                                    /* cost of moving MMX register */
 277   {4, 8},                               /* cost of loading MMX registers
 278                                            in SImode and DImode */
 279   {4, 8},                               /* cost of storing MMX registers
 280                                            in SImode and DImode */
 281   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 282   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 283                                            in 32,64,128,256 and 512-bit */
 284   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 285                                            in 32,64,128,256 and 512-bit */
 286   3, 3,                         /* SSE->integer and integer->SSE moves */
 287   3, 3,                         /* mask->integer and integer->mask moves */
 288   {2, 4, 2},                            /* cost of loading mask register
 289                                            in QImode, HImode, SImode.  */
 290   {2, 4, 2},                            /* cost of storing mask register
 291                                            in QImode, HImode, SImode.  */
 292   2,                                    /* cost of moving mask register.  */
 293   /* End of register allocator costs.  */
 294   },
 295
 296   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 297   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 298   COSTS_N_INSNS (3),                    /* variable shift costs */
 299   COSTS_N_INSNS (2),                    /* constant shift costs */
 300   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 301    COSTS_N_INSNS (12),                  /*                               HI */
 302    COSTS_N_INSNS (12),                  /*                               SI */
 303    COSTS_N_INSNS (12),                  /*                               DI */
 304    COSTS_N_INSNS (12)},                 /*                            other */
 305   1,                                    /* cost of multiply per each bit set.  NOTE(review): raw 1 here, whereas i386_cost writes COSTS_N_INSNS (1) in this slot -- confirm intended.  */
 306   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 307    COSTS_N_INSNS (40),                  /*                          HI */
 308    COSTS_N_INSNS (40),                  /*                          SI */
 309    COSTS_N_INSNS (40),                  /*                          DI */
 310    COSTS_N_INSNS (40)},                 /*                          other */
 311   COSTS_N_INSNS (3),                    /* cost of movsx */
 312   COSTS_N_INSNS (2),                    /* cost of movzx */
 313   15,                                   /* "large" insn */
 314   3,                                    /* MOVE_RATIO */
 315   3,                                    /* CLEAR_RATIO */
 316   {2, 4, 2},                            /* cost of loading integer registers
 317                                            in QImode, HImode and SImode.
 318                                            Relative to reg-reg move (2).  */
 319   {2, 4, 2},                            /* cost of storing integer registers */
 320   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 321                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 322   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 323                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 324   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 325   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 326   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 327   3,                                    /* cost of moving SSE register to integer.  */
 328   4, 4,                                 /* Gather load static, per_elt.  */
 329   4, 4,                                 /* Gather store static, per_elt.  */
 330   4,                                    /* size of l1 cache.  486 has 8kB cache
 331                                            shared for code and data, so 4kB is
 332                                            not really precise.  */
 333   4,                                    /* size of l2 cache  */
 334   0,                                    /* size of prefetch block */
 335   0,                                    /* number of parallel prefetches */
 336   1,                                    /* Branch cost */
 337   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 338   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 339   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 340   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 341   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 342   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 343
 344   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 345   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 346   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 347   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 348   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 349   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 350   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 351   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.  NOTE(review): 74 here vs. 73 for FDIV and DIVSS in this table -- confirm intended.  */
 352   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 353   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 354   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 355   i486_memcpy,                          /* memcpy stringop_algs table.  */
 356   i486_memset,                          /* memset stringop_algs table.  */
 357   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 358   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 359   "16",                                 /* Loop alignment.  */
 360   "16",                                 /* Jump alignment.  */
 361   "0:0:8",                              /* Label alignment.  */
 362   "16",                                 /* Func alignment.  */
 363   4,                                    /* Small unroll limit.  */
 364   2,                                    /* Small unroll factor.  */
 365   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 366 };
 367
 368 static stringop_algs pentium_memcpy[2] = { /* memcpy stringop_algs referenced by pentium_cost.  */
 369   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, /* presumably rep_prefix_4_byte up to 256 bytes, libcall beyond -- TODO confirm against the stringop_algs declaration.  */
 370   DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder.  */
 371 static stringop_algs pentium_memset[2] = { /* memset stringop_algs referenced by pentium_cost.  */
 372   {libcall, {{-1, rep_prefix_4_byte, false}}},
 373   DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder.  */
 374
 375 static const
 376 struct processor_costs pentium_cost = { /* Pentium specific costs */
 377   {
 378   /* Start of register allocator costs.  integer->integer move cost is 2. */
 379   6,                                 /* cost for loading QImode using movzbl */
 380   {2, 4, 2},                            /* cost of loading integer registers
 381                                            in QImode, HImode and SImode.
 382                                            Relative to reg-reg move (2).  */
 383   {2, 4, 2},                            /* cost of storing integer registers */
 384   2,                                    /* cost of reg,reg fld/fst */
 385   {2, 2, 6},                            /* cost of loading fp registers
 386                                            in SFmode, DFmode and XFmode */
 387   {4, 4, 6},                            /* cost of storing fp registers
 388                                            in SFmode, DFmode and XFmode */
 389   8,                                    /* cost of moving MMX register */
 390   {8, 8},                               /* cost of loading MMX registers
 391                                            in SImode and DImode */
 392   {8, 8},                               /* cost of storing MMX registers
 393                                            in SImode and DImode */
 394   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 395   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 396                                            in 32,64,128,256 and 512-bit */
 397   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 398                                            in 32,64,128,256 and 512-bit */
 399   3, 3,                         /* SSE->integer and integer->SSE moves */
 400   3, 3,                         /* mask->integer and integer->mask moves */
 401   {2, 4, 2},                            /* cost of loading mask register
 402                                            in QImode, HImode, SImode.  */
 403   {2, 4, 2},                            /* cost of storing mask register
 404                                            in QImode, HImode, SImode.  */
 405   2,                                    /* cost of moving mask register.  */
 406   /* End of register allocator costs.  */
 407   },
 408
 409   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 410   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 411   COSTS_N_INSNS (4),                    /* variable shift costs */
 412   COSTS_N_INSNS (1),                    /* constant shift costs */
 413   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 414    COSTS_N_INSNS (11),                  /*                               HI */
 415    COSTS_N_INSNS (11),                  /*                               SI */
 416    COSTS_N_INSNS (11),                  /*                               DI */
 417    COSTS_N_INSNS (11)},                 /*                            other */
 418   0,                                    /* cost of multiply per each bit set */
 419   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 420    COSTS_N_INSNS (25),                  /*                          HI */
 421    COSTS_N_INSNS (25),                  /*                          SI */
 422    COSTS_N_INSNS (25),                  /*                          DI */
 423    COSTS_N_INSNS (25)},                 /*                          other */
 424   COSTS_N_INSNS (3),                    /* cost of movsx */
 425   COSTS_N_INSNS (2),                    /* cost of movzx */
 426   8,                                    /* "large" insn */
 427   6,                                    /* MOVE_RATIO */
 428   6,                                    /* CLEAR_RATIO */
 429   {2, 4, 2},                            /* cost of loading integer registers
 430                                            in QImode, HImode and SImode.
 431                                            Relative to reg-reg move (2).  */
 432   {2, 4, 2},                            /* cost of storing integer registers */
 433   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 434                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 435   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 436                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 437   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 438   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 439   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 440   3,                                    /* cost of moving SSE register to integer.  */
 441   4, 4,                                 /* Gather load static, per_elt.  */
 442   4, 4,                                 /* Gather store static, per_elt.  */
 443   8,                                    /* size of l1 cache.  */
 444   8,                                    /* size of l2 cache  */
 445   0,                                    /* size of prefetch block */
 446   0,                                    /* number of parallel prefetches */
 447   2,                                    /* Branch cost */
 448   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 449   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 450   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 451   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 452   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 453   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 454
 455   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 456   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 457   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 458   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 459   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 460   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 461   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 462   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 463   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 464   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 465   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 466   pentium_memcpy,                       /* memcpy stringop_algs table.  */
 467   pentium_memset,                       /* memset stringop_algs table.  */
 468   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 469   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 470   "16:8:8",                             /* Loop alignment.  */
 471   "16:8:8",                             /* Jump alignment.  */
 472   "0:0:8",                              /* Label alignment.  */
 473   "16",                                 /* Func alignment.  */
 474   4,                                    /* Small unroll limit.  */
 475   2,                                    /* Small unroll factor.  */
 476   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 477 };
 478
 479 static const
 480 struct processor_costs lakemont_cost = {/* costs for tuning for lakemont */
 481   {
 482   /* Start of register allocator costs.  integer->integer move cost is 2. */
 483   6,                                 /* cost for loading QImode using movzbl */
 484   {2, 4, 2},                            /* cost of loading integer registers
 485                                            in QImode, HImode and SImode.
 486                                            Relative to reg-reg move (2).  */
 487   {2, 4, 2},                            /* cost of storing integer registers */
 488   2,                                    /* cost of reg,reg fld/fst */
 489   {2, 2, 6},                            /* cost of loading fp registers
 490                                            in SFmode, DFmode and XFmode */
 491   {4, 4, 6},                            /* cost of storing fp registers
 492                                            in SFmode, DFmode and XFmode */
 493   8,                                    /* cost of moving MMX register */
 494   {8, 8},                               /* cost of loading MMX registers
 495                                            in SImode and DImode */
 496   {8, 8},                               /* cost of storing MMX registers
 497                                            in SImode and DImode */
 498   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 499   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 500                                            in 32,64,128,256 and 512-bit */
 501   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 502                                            in 32,64,128,256 and 512-bit */
 503   3, 3,                         /* SSE->integer and integer->SSE moves */
 504   3, 3,                         /* mask->integer and integer->mask moves */
 505   {2, 4, 2},                            /* cost of loading mask register
 506                                            in QImode, HImode, SImode.  */
 507   {2, 4, 2},                            /* cost of storing mask register
 508                                            in QImode, HImode, SImode.  */
 509   2,                                    /* cost of moving mask register.  */
 510   /* End of register allocator costs.  */
 511   },
 512
 513   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 514   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction: add + 1 */
 515   COSTS_N_INSNS (1),                    /* variable shift costs */
 516   COSTS_N_INSNS (1),                    /* constant shift costs */
 517   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 518    COSTS_N_INSNS (11),                  /*                               HI */
 519    COSTS_N_INSNS (11),                  /*                               SI */
 520    COSTS_N_INSNS (11),                  /*                               DI */
 521    COSTS_N_INSNS (11)},                 /*                            other */
 522   0,                                    /* cost of multiply per each bit set */
 523   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 524    COSTS_N_INSNS (25),                  /*                          HI */
 525    COSTS_N_INSNS (25),                  /*                          SI */
 526    COSTS_N_INSNS (25),                  /*                          DI */
 527    COSTS_N_INSNS (25)},                 /*                          other */
 528   COSTS_N_INSNS (3),                    /* cost of movsx */
 529   COSTS_N_INSNS (2),                    /* cost of movzx */
 530   8,                                    /* "large" insn */
 531   17,                                   /* MOVE_RATIO */
 532   6,                                    /* CLEAR_RATIO */
 533   {2, 4, 2},                            /* cost of loading integer registers
 534                                            in QImode, HImode and SImode.
 535                                            Relative to reg-reg move (2).  */
 536   {2, 4, 2},                            /* cost of storing integer registers */
 537   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 538                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 539   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 540                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 541   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 542   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 543   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 544   3,                                    /* cost of moving SSE register to integer.  */
 545   4, 4,                                 /* Gather load static, per_elt.  */
 546   4, 4,                                 /* Gather store static, per_elt.  */
 547   8,                                    /* size of l1 cache.  */
 548   8,                                    /* size of l2 cache.  */
 549   0,                                    /* size of prefetch block */
 550   0,                                    /* number of parallel prefetches */
 551   2,                                    /* Branch cost */
 552   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 553   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 554   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 555   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 556   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 557   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 558
 559   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 560   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 561   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 562   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 563   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 564   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 565   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 566   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 567   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 568   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 569   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 570   pentium_memcpy,                       /* memcpy algorithms (pentium table reused).  */
 571   pentium_memset,                       /* memset algorithms (pentium table reused).  */
 572   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 573   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 574   "16:8:8",                             /* Loop alignment.  */
 575   "16:8:8",                             /* Jump alignment.  */
 576   "0:0:8",                              /* Label alignment.  */
 577   "16",                                 /* Func alignment.  */
 578   4,                                    /* Small unroll limit.  */
 579   2,                                    /* Small unroll factor.  */
 580   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 581 };
 582
 583 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 584    (we ensure the alignment).  For small blocks an inline loop is still a
 585    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 586    way to go.  Rep movsb apparently has a more expensive startup time in the
 587    CPU, but after 4K the difference is down in the noise.  */
 588 static stringop_algs pentiumpro_memcpy[2] = {  /* memcpy algorithms used by pentiumpro_cost */
 589   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 590                        {8192, rep_prefix_4_byte, false},
 591                        {-1, rep_prefix_1_byte, false}}},  /* -1: presumably "any larger size" -- confirm */
 592   DUMMY_STRINGOP_ALGS};  /* second slot: libcall-only placeholder */
 593 static stringop_algs pentiumpro_memset[2] = {  /* memset algorithms used by pentiumpro_cost */
 594   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 595                        {8192, rep_prefix_4_byte, false},
 596                        {-1, libcall, false}}},  /* -1: presumably "any larger size" -- confirm */
 597   DUMMY_STRINGOP_ALGS};  /* second slot: libcall-only placeholder */
 598 static const
 599 struct processor_costs pentiumpro_cost = {/* costs for tuning for pentiumpro */
 600   {
 601   /* Start of register allocator costs.  integer->integer move cost is 2. */
 602   2,                                 /* cost for loading QImode using movzbl */
 603   {4, 4, 4},                            /* cost of loading integer registers
 604                                            in QImode, HImode and SImode.
 605                                            Relative to reg-reg move (2).  */
 606   {2, 2, 2},                            /* cost of storing integer registers */
 607   2,                                    /* cost of reg,reg fld/fst */
 608   {2, 2, 6},                            /* cost of loading fp registers
 609                                            in SFmode, DFmode and XFmode */
 610   {4, 4, 6},                            /* cost of storing fp registers
 611                                            in SFmode, DFmode and XFmode */
 612   2,                                    /* cost of moving MMX register */
 613   {2, 2},                               /* cost of loading MMX registers
 614                                            in SImode and DImode */
 615   {2, 2},                               /* cost of storing MMX registers
 616                                            in SImode and DImode */
 617   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 618   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 619                                            in 32,64,128,256 and 512-bit */
 620   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 621                                            in 32,64,128,256 and 512-bit */
 622   3, 3,                         /* SSE->integer and integer->SSE moves */
 623   3, 3,                         /* mask->integer and integer->mask moves */
 624   {4, 4, 4},                            /* cost of loading mask register
 625                                            in QImode, HImode, SImode.  */
 626   {2, 2, 2},                            /* cost of storing mask register
 627                                            in QImode, HImode, SImode.  */
 628   2,                                    /* cost of moving mask register.  */
 629   /* End of register allocator costs.  */
 630   },
 631
 632   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 633   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 634   COSTS_N_INSNS (1),                    /* variable shift costs */
 635   COSTS_N_INSNS (1),                    /* constant shift costs */
 636   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 637    COSTS_N_INSNS (4),                   /*                               HI */
 638    COSTS_N_INSNS (4),                   /*                               SI */
 639    COSTS_N_INSNS (4),                   /*                               DI */
 640    COSTS_N_INSNS (4)},                  /*                            other */
 641   0,                                    /* cost of multiply per each bit set */
 642   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 643    COSTS_N_INSNS (17),                  /*                          HI */
 644    COSTS_N_INSNS (17),                  /*                          SI */
 645    COSTS_N_INSNS (17),                  /*                          DI */
 646    COSTS_N_INSNS (17)},                 /*                          other */
 647   COSTS_N_INSNS (1),                    /* cost of movsx */
 648   COSTS_N_INSNS (1),                    /* cost of movzx */
 649   8,                                    /* "large" insn */
 650   6,                                    /* MOVE_RATIO */
 651   6,                                    /* CLEAR_RATIO */
 652   {4, 4, 4},                            /* cost of loading integer registers
 653                                            in QImode, HImode and SImode.
 654                                            Relative to reg-reg move (2).  */
 655   {2, 2, 2},                            /* cost of storing integer registers */
 656   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 657                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 658   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 659                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 660   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 661   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 662   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 663   3,                                    /* cost of moving SSE register to integer.  */
 664   4, 4,                                 /* Gather load static, per_elt.  */
 665   4, 4,                                 /* Gather store static, per_elt.  */
 666   8,                                    /* size of l1 cache.  */
 667   256,                                  /* size of l2 cache.  */
 668   32,                                   /* size of prefetch block */
 669   6,                                    /* number of parallel prefetches */
 670   2,                                    /* Branch cost */
 671   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 672   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 673   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 674   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 675   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 676   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 677
 678   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 679   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 680   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 681   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 682   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 683   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 684   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 685   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 686   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 687   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 688   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 689   pentiumpro_memcpy,                    /* memcpy algorithms.  */
 690   pentiumpro_memset,                    /* memset algorithms.  */
 691   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 692   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 693   "16",                                 /* Loop alignment.  */
 694   "16:11:8",                            /* Jump alignment.  */
 695   "0:0:8",                              /* Label alignment.  */
 696   "16",                                 /* Func alignment.  */
 697   4,                                    /* Small unroll limit.  */
 698   2,                                    /* Small unroll factor.  */
 699   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 700 };
 701
 702 static stringop_algs geode_memcpy[2] = {  /* memcpy algorithms used by geode_cost */
 703   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 704   DUMMY_STRINGOP_ALGS};  /* second slot: libcall-only placeholder */
 705 static stringop_algs geode_memset[2] = {  /* memset algorithms used by geode_cost */
 706   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 707   DUMMY_STRINGOP_ALGS};  /* second slot: libcall-only placeholder */
 708 static const
 709 struct processor_costs geode_cost = {/* costs for tuning for geode */
 710   {
 711   /* Start of register allocator costs.  integer->integer move cost is 2. */
 712   2,                                 /* cost for loading QImode using movzbl */
 713   {2, 2, 2},                            /* cost of loading integer registers
 714                                            in QImode, HImode and SImode.
 715                                            Relative to reg-reg move (2).  */
 716   {2, 2, 2},                            /* cost of storing integer registers */
 717   2,                                    /* cost of reg,reg fld/fst */
 718   {2, 2, 2},                            /* cost of loading fp registers
 719                                            in SFmode, DFmode and XFmode */
 720   {4, 6, 6},                            /* cost of storing fp registers
 721                                            in SFmode, DFmode and XFmode */
 722   2,                                    /* cost of moving MMX register */
 723   {2, 2},                               /* cost of loading MMX registers
 724                                            in SImode and DImode */
 725   {2, 2},                               /* cost of storing MMX registers
 726                                            in SImode and DImode */
 727   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 728   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 729                                            in 32,64,128,256 and 512-bit */
 730   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 731                                            in 32,64,128,256 and 512-bit */
 732   6, 6,                         /* SSE->integer and integer->SSE moves */
 733   6, 6,                         /* mask->integer and integer->mask moves */
 734   {2, 2, 2},                            /* cost of loading mask register
 735                                            in QImode, HImode, SImode.  */
 736   {2, 2, 2},                            /* cost of storing mask register
 737                                            in QImode, HImode, SImode.  */
 738   2,                                    /* cost of moving mask register.  */
 739   /* End of register allocator costs.  */
 740   },
 741
 742   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 743   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 744   COSTS_N_INSNS (2),                    /* variable shift costs */
 745   COSTS_N_INSNS (1),                    /* constant shift costs */
 746   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 747    COSTS_N_INSNS (4),                   /*                               HI */
 748    COSTS_N_INSNS (7),                   /*                               SI */
 749    COSTS_N_INSNS (7),                   /*                               DI */
 750    COSTS_N_INSNS (7)},                  /*                            other */
 751   0,                                    /* cost of multiply per each bit set */
 752   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 753    COSTS_N_INSNS (23),                  /*                          HI */
 754    COSTS_N_INSNS (39),                  /*                          SI */
 755    COSTS_N_INSNS (39),                  /*                          DI */
 756    COSTS_N_INSNS (39)},                 /*                          other */
 757   COSTS_N_INSNS (1),                    /* cost of movsx */
 758   COSTS_N_INSNS (1),                    /* cost of movzx */
 759   8,                                    /* "large" insn */
 760   4,                                    /* MOVE_RATIO */
 761   4,                                    /* CLEAR_RATIO */
 762   {2, 2, 2},                            /* cost of loading integer registers
 763                                            in QImode, HImode and SImode.
 764                                            Relative to reg-reg move (2).  */
 765   {2, 2, 2},                            /* cost of storing integer registers */
 766   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 767                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 768   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 769                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 770   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 771   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 772   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 773   6,                                    /* cost of moving SSE register to integer.  */
 774   2, 2,                                 /* Gather load static, per_elt.  */
 775   2, 2,                                 /* Gather store static, per_elt.  */
 776   64,                                   /* size of l1 cache.  */
 777   128,                                  /* size of l2 cache.  */
 778   32,                                   /* size of prefetch block */
 779   1,                                    /* number of parallel prefetches */
 780   1,                                    /* Branch cost */
 781   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 782   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 783   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 784   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 785   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 786   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 787
 788   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 789   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 790   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 791   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 792   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 793   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 794   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 795   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 796   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 797   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 798   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 799   geode_memcpy,                         /* memcpy algorithms.  */
 800   geode_memset,                         /* memset algorithms.  */
 801   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 802   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 803   NULL,                                 /* Loop alignment (none given).  */
 804   NULL,                                 /* Jump alignment (none given).  */
 805   NULL,                                 /* Label alignment (none given).  */
 806   NULL,                                 /* Func alignment (none given).  */
 807   4,                                    /* Small unroll limit.  */
 808   2,                                    /* Small unroll factor.  */
 809   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 810 };
 811
 812 static stringop_algs k6_memcpy[2] = {  /* memcpy algorithms used by k6_cost */
 813   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 814   DUMMY_STRINGOP_ALGS};  /* second slot: libcall-only placeholder */
 815 static stringop_algs k6_memset[2] = {  /* memset algorithms used by k6_cost */
 816   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 817   DUMMY_STRINGOP_ALGS};  /* second slot: libcall-only placeholder */
 818 static const
 819 struct processor_costs k6_cost = {/* costs for tuning for k6 */
 820   {
 821   /* Start of register allocator costs.  integer->integer move cost is 2. */
 822   3,                                 /* cost for loading QImode using movzbl */
 823   {4, 5, 4},                            /* cost of loading integer registers
 824                                            in QImode, HImode and SImode.
 825                                            Relative to reg-reg move (2).  */
 826   {2, 3, 2},                            /* cost of storing integer registers */
 827   4,                                    /* cost of reg,reg fld/fst */
 828   {6, 6, 6},                            /* cost of loading fp registers
 829                                            in SFmode, DFmode and XFmode */
 830   {4, 4, 4},                            /* cost of storing fp registers
 831                                            in SFmode, DFmode and XFmode */
 832   2,                                    /* cost of moving MMX register */
 833   {2, 2},                               /* cost of loading MMX registers
 834                                            in SImode and DImode */
 835   {2, 2},                               /* cost of storing MMX registers
 836                                            in SImode and DImode */
 837   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 838   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 839                                            in 32,64,128,256 and 512-bit */
 840   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 841                                            in 32,64,128,256 and 512-bit */
 842   6, 6,                         /* SSE->integer and integer->SSE moves */
 843   6, 6,                         /* mask->integer and integer->mask moves */
 844   {4, 5, 4},                            /* cost of loading mask register
 845                                            in QImode, HImode, SImode.  */
 846   {2, 3, 2},                            /* cost of storing mask register
 847                                            in QImode, HImode, SImode.  */
 848   2,                                    /* cost of moving mask register.  */
 849   /* End of register allocator costs.  */
 850   },
 851
 852   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 853   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 854   COSTS_N_INSNS (1),                    /* variable shift costs */
 855   COSTS_N_INSNS (1),                    /* constant shift costs */
 856   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 857    COSTS_N_INSNS (3),                   /*                               HI */
 858    COSTS_N_INSNS (3),                   /*                               SI */
 859    COSTS_N_INSNS (3),                   /*                               DI */
 860    COSTS_N_INSNS (3)},                  /*                            other */
 861   0,                                    /* cost of multiply per each bit set */
 862   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 863    COSTS_N_INSNS (18),                  /*                          HI */
 864    COSTS_N_INSNS (18),                  /*                          SI */
 865    COSTS_N_INSNS (18),                  /*                          DI */
 866    COSTS_N_INSNS (18)},                 /*                          other */
 867   COSTS_N_INSNS (2),                    /* cost of movsx */
 868   COSTS_N_INSNS (2),                    /* cost of movzx */
 869   8,                                    /* "large" insn */
 870   4,                                    /* MOVE_RATIO */
 871   4,                                    /* CLEAR_RATIO */
 872   {4, 5, 4},                            /* cost of loading integer registers
 873                                            in QImode, HImode and SImode.
 874                                            Relative to reg-reg move (2).  */
 875   {2, 3, 2},                            /* cost of storing integer registers */
 876   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 877                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 878   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 879                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 880   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 881   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 882   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 883   6,                                    /* cost of moving SSE register to integer.  */
 884   2, 2,                                 /* Gather load static, per_elt.  */
 885   2, 2,                                 /* Gather store static, per_elt.  */
 886   32,                                   /* size of l1 cache.  */
 887   32,                                   /* size of l2 cache.  Some models
 888                                            have integrated l2 cache, but
 889                                            optimizing for k6 is not important
 890                                            enough to worry about that.  */
 891   32,                                   /* size of prefetch block */
 892   1,                                    /* number of parallel prefetches */
 893   1,                                    /* Branch cost */
 894   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 895   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 896   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 897   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 898   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 899   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 900
 901   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 902   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 903   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 904   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 905   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 906   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 907   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 908   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 909   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 910   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 911   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 912   k6_memcpy,                            /* memcpy algorithms.  */
 913   k6_memset,                            /* memset algorithms.  */
 914   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 915   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 916   "32:8:8",                             /* Loop alignment.  */
 917   "32:8:8",                             /* Jump alignment.  */
 918   "0:0:8",                              /* Label alignment.  */
 919   "32",                                 /* Func alignment.  */
 920   4,                                    /* Small unroll limit.  */
 921   2,                                    /* Small unroll factor.  */
 922   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
 923 };
 924
 925 /* For some reason, Athlon deals better with REP prefix (relative to loops)
 926    compared to K8. Alignment becomes important after 8 bytes for memcpy and
 927    128 bytes for memset.  */
 928 static stringop_algs athlon_memcpy[2] = {
     /* Athlon memcpy strategy table, referenced by athlon_cost.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, -1 marks the catch-all entry, and the two
        array slots are the 32-bit and 64-bit variants -- confirm against
        the stringop_algs definition.  */
 929   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
     /* Second slot is the placeholder macro, which expands to
        {libcall, {{-1, libcall, false}}}.  */
 930   DUMMY_STRINGOP_ALGS};
 931 static stringop_algs athlon_memset[2] = {
     /* Athlon memset strategy table, referenced by athlon_cost.  The first
        slot holds the same values as the first slot of athlon_memcpy.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, with -1 as the catch-all entry -- confirm
        against the stringop_algs definition.  */
 932   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
     /* Second slot is the placeholder macro, which expands to
        {libcall, {{-1, libcall, false}}}.  */
 933   DUMMY_STRINGOP_ALGS};
 934 static const
 935 struct processor_costs athlon_cost = {
     /* Cost table used when tuning for Athlon.  The initializer is purely
        positional, so the order of the entries must match the field order
        of struct processor_costs.  Values wrapped in COSTS_N_INSNS are
        relative to an add; the register allocator costs in the nested
        block are relative to a reg-reg move (2).  */
 936   {
 937   /* Start of register allocator costs.  integer->integer move cost is 2. */
 938   4,                                 /* cost for loading QImode using movzbl */
 939   {3, 4, 3},                            /* cost of loading integer registers
 940                                            in QImode, HImode and SImode.
 941                                            Relative to reg-reg move (2).  */
 942   {3, 4, 3},                            /* cost of storing integer registers */
 943   4,                                    /* cost of reg,reg fld/fst */
 944   {4, 4, 12},                           /* cost of loading fp registers
 945                                            in SFmode, DFmode and XFmode */
 946   {6, 6, 8},                            /* cost of storing fp registers
 947                                            in SFmode, DFmode and XFmode */
 948   2,                                    /* cost of moving MMX register */
 949   {4, 4},                               /* cost of loading MMX registers
 950                                            in SImode and DImode */
 951   {4, 4},                               /* cost of storing MMX registers
 952                                            in SImode and DImode */
 953   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 954   {4, 4, 12, 12, 24},                   /* cost of loading SSE registers
 955                                            in 32,64,128,256 and 512-bit */
 956   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 957                                            in 32,64,128,256 and 512-bit */
 958   5, 5,                         /* SSE->integer and integer->SSE moves */
 959   5, 5,                         /* mask->integer and integer->mask moves */
 960   {3, 4, 3},                            /* cost of loading mask register
 961                                            in QImode, HImode, SImode.  */
 962   {3, 4, 3},                            /* cost of storing mask register
 963                                            in QImode, HImode, SImode.  */
 964   2,                                    /* cost of moving mask register.  */
 965   /* End of register allocator costs.  */
 966   },
 967
 968   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 969   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 970   COSTS_N_INSNS (1),                    /* variable shift costs */
 971   COSTS_N_INSNS (1),                    /* constant shift costs */
 972   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 973    COSTS_N_INSNS (5),                   /*                               HI */
 974    COSTS_N_INSNS (5),                   /*                               SI */
 975    COSTS_N_INSNS (5),                   /*                               DI */
 976    COSTS_N_INSNS (5)},                  /*                            other */
 977   0,                                    /* cost of multiply per each bit set */
 978   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 979    COSTS_N_INSNS (26),                  /*                          HI */
 980    COSTS_N_INSNS (42),                  /*                          SI */
 981    COSTS_N_INSNS (74),                  /*                          DI */
 982    COSTS_N_INSNS (74)},                 /*                          other */
 983   COSTS_N_INSNS (1),                    /* cost of movsx */
 984   COSTS_N_INSNS (1),                    /* cost of movzx */
 985   8,                                    /* "large" insn */
 986   9,                                    /* MOVE_RATIO */
 987   6,                                    /* CLEAR_RATIO */
     /* The integer and SSE load/store costs that follow repeat the values
        given in the register allocator block above.  */
 988   {3, 4, 3},                            /* cost of loading integer registers
 989                                            in QImode, HImode and SImode.
 990                                            Relative to reg-reg move (2).  */
 991   {3, 4, 3},                            /* cost of storing integer registers */
 992   {4, 4, 12, 12, 24},                   /* cost of loading SSE register
 993                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 994   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
 995                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 996   {4, 4, 12, 12, 24},                   /* cost of unaligned loads.  */
 997   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
 998   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 999   5,                                    /* cost of moving SSE register to integer.  */
1000   4, 4,                                 /* Gather load static, per_elt.  */
1001   4, 4,                                 /* Gather store static, per_elt.  */
1002   64,                                   /* size of l1 cache.  */
1003   256,                                  /* size of l2 cache.  */
1004   64,                                   /* size of prefetch block */
1005   6,                                    /* number of parallel prefetches */
1006   5,                                    /* Branch cost */
1007   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1008   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1009   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
1010   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1011   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1012   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1013
1014   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1015   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1016   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1017   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1018   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1019   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1020   /* 11-16  */
1021   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1022   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
1023   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1024   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
1025   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
     /* String operation strategy tables defined just before this table.  */
1026   athlon_memcpy,
1027   athlon_memset,
1028   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1029   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1030   "16:8:8",                             /* Loop alignment.  */
1031   "16:8:8",                             /* Jump alignment.  */
1032   "0:0:8",                              /* Label alignment.  */
1033   "16",                                 /* Func alignment.  */
1034   4,                                    /* Small unroll limit.  */
1035   2,                                    /* Small unroll factor.  */
1036   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
1037 };
1038
1039 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1040    small blocks it is better to use a loop.  For large blocks, a libcall can
1041    use non-temporal accesses and beat inline code considerably.  */
1042 static stringop_algs k8_memcpy[2] = {
     /* K8 memcpy strategy table, referenced by k8_cost.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, -1 marks the catch-all entry, and the two
        array slots are the 32-bit and 64-bit variants (only the second slot
        uses rep_prefix_8_byte) -- confirm against the stringop_algs
        definition.  */
1043   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1044              {-1, rep_prefix_4_byte, false}}},
1045   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1046              {-1, libcall, false}}}};
1047 static stringop_algs k8_memset[2] = {
     /* K8 memset strategy table, referenced by k8_cost.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, -1 marks the catch-all entry, and the two
        array slots are the 32-bit and 64-bit variants (only the second slot
        uses rep_prefix_8_byte) -- confirm against the stringop_algs
        definition.  */
1048   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1049              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1050   {libcall, {{48, unrolled_loop, false},
1051              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1052 static const
1053 struct processor_costs k8_cost = {
     /* Cost table used when tuning for K8.  The initializer is purely
        positional, so the order of the entries must match the field order
        of struct processor_costs.  Values wrapped in COSTS_N_INSNS are
        relative to an add; the register allocator costs in the nested
        block are relative to a reg-reg move (2).  */
1054   {
1055   /* Start of register allocator costs.  integer->integer move cost is 2. */
1056   4,                                 /* cost for loading QImode using movzbl */
1057   {3, 4, 3},                            /* cost of loading integer registers
1058                                            in QImode, HImode and SImode.
1059                                            Relative to reg-reg move (2).  */
1060   {3, 4, 3},                            /* cost of storing integer registers */
1061   4,                                    /* cost of reg,reg fld/fst */
1062   {4, 4, 12},                           /* cost of loading fp registers
1063                                            in SFmode, DFmode and XFmode */
1064   {6, 6, 8},                            /* cost of storing fp registers
1065                                            in SFmode, DFmode and XFmode */
1066   2,                                    /* cost of moving MMX register */
1067   {3, 3},                               /* cost of loading MMX registers
1068                                            in SImode and DImode */
1069   {4, 4},                               /* cost of storing MMX registers
1070                                            in SImode and DImode */
1071   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1072   {4, 3, 12, 12, 24},                   /* cost of loading SSE registers
1073                                            in 32,64,128,256 and 512-bit */
1074   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
1075                                            in 32,64,128,256 and 512-bit */
1076   5, 5,                         /* SSE->integer and integer->SSE moves */
1077   5, 5,                         /* mask->integer and integer->mask moves */
1078   {3, 4, 3},                            /* cost of loading mask register
1079                                            in QImode, HImode, SImode.  */
1080   {3, 4, 3},                            /* cost of storing mask register
1081                                            in QImode, HImode, SImode.  */
1082   2,                                    /* cost of moving mask register.  */
1083   /* End of register allocator costs.  */
1084   },
1085
1086   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1087   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1088   COSTS_N_INSNS (1),                    /* variable shift costs */
1089   COSTS_N_INSNS (1),                    /* constant shift costs */
1090   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1091    COSTS_N_INSNS (4),                   /*                               HI */
1092    COSTS_N_INSNS (3),                   /*                               SI */
1093    COSTS_N_INSNS (4),                   /*                               DI */
1094    COSTS_N_INSNS (5)},                  /*                            other */
1095   0,                                    /* cost of multiply per each bit set */
1096   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1097    COSTS_N_INSNS (26),                  /*                          HI */
1098    COSTS_N_INSNS (42),                  /*                          SI */
1099    COSTS_N_INSNS (74),                  /*                          DI */
1100    COSTS_N_INSNS (74)},                 /*                          other */
1101   COSTS_N_INSNS (1),                    /* cost of movsx */
1102   COSTS_N_INSNS (1),                    /* cost of movzx */
1103   8,                                    /* "large" insn */
1104   9,                                    /* MOVE_RATIO */
1105   6,                                    /* CLEAR_RATIO */
     /* The integer and SSE load/store costs that follow repeat the values
        given in the register allocator block above.  */
1106   {3, 4, 3},                            /* cost of loading integer registers
1107                                            in QImode, HImode and SImode.
1108                                            Relative to reg-reg move (2).  */
1109   {3, 4, 3},                            /* cost of storing integer registers */
1110   {4, 3, 12, 12, 24},                   /* cost of loading SSE register
1111                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1112   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
1113                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1114   {4, 3, 12, 12, 24},                   /* cost of unaligned loads.  */
1115   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
1116   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1117   5,                                    /* cost of moving SSE register to integer.  */
1118   4, 4,                                 /* Gather load static, per_elt.  */
1119   4, 4,                                 /* Gather store static, per_elt.  */
1120   64,                                   /* size of l1 cache.  */
1121   512,                                  /* size of l2 cache.  */
1122   64,                                   /* size of prefetch block */
1123   /* New AMD processors never drop prefetches; if they cannot be performed
1124      immediately, they are queued.  We set number of simultaneous prefetches
1125      to a large constant to reflect this (it probably is not a good idea not
1126      to limit number of prefetches at all, as their execution also takes some
1127      time).  */
1128   100,                                  /* number of parallel prefetches */
1129   3,                                    /* Branch cost */
1130   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1131   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1132   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1133   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1134   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1135   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1136
1137   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1138   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1139   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1140   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1141   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1142   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1143   /* 11-16  */
1144   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1145   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1146   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1147   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1148   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
     /* String operation strategy tables defined just before this table.  */
1149   k8_memcpy,
1150   k8_memset,
1151   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1152   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1153   "16:8:8",                             /* Loop alignment.  */
1154   "16:8:8",                             /* Jump alignment.  */
1155   "0:0:8",                              /* Label alignment.  */
1156   "16",                                 /* Func alignment.  */
1157   4,                                    /* Small unroll limit.  */
1158   2,                                    /* Small unroll factor.  */
1159   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
1160 };
1161
1162 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1163    very small blocks it is better to use a loop.  For large blocks, a libcall
1164    can use non-temporal accesses and beat inline code considerably.  */
1165 static stringop_algs amdfam10_memcpy[2] = {
     /* AMDFAM10 memcpy strategy table, referenced by amdfam10_cost.  The
        values are the same as in k8_memcpy.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, -1 marks the catch-all entry, and the two
        array slots are the 32-bit and 64-bit variants (only the second slot
        uses rep_prefix_8_byte) -- confirm against the stringop_algs
        definition.  */
1166   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1167              {-1, rep_prefix_4_byte, false}}},
1168   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1169              {-1, libcall, false}}}};
1170 static stringop_algs amdfam10_memset[2] = {
     /* AMDFAM10 memset strategy table, referenced by amdfam10_cost.  The
        values are the same as in k8_memset.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, -1 marks the catch-all entry, and the two
        array slots are the 32-bit and 64-bit variants (only the second slot
        uses rep_prefix_8_byte) -- confirm against the stringop_algs
        definition.  */
1171   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1172              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1173   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1174              {-1, libcall, false}}}};
1175 static const struct processor_costs amdfam10_cost = {
     /* Cost table used when tuning for AMDFAM10.  The initializer is purely
        positional, so the order of the entries must match the field order
        of struct processor_costs.  Values wrapped in COSTS_N_INSNS are
        relative to an add; the register allocator costs in the nested
        block are relative to a reg-reg move (2).

        The table is static const like the athlon_cost and k8_cost tables
        above: it is read-only data, and a header must not define a
        writable object with external linkage.  */
1176   {
1177   /* Start of register allocator costs.  integer->integer move cost is 2. */
1178   4,                                 /* cost for loading QImode using movzbl */
1179   {3, 4, 3},                            /* cost of loading integer registers
1180                                            in QImode, HImode and SImode.
1181                                            Relative to reg-reg move (2).  */
1182   {3, 4, 3},                            /* cost of storing integer registers */
1183   4,                                    /* cost of reg,reg fld/fst */
1184   {4, 4, 12},                           /* cost of loading fp registers
1185                                            in SFmode, DFmode and XFmode */
1186   {6, 6, 8},                            /* cost of storing fp registers
1187                                            in SFmode, DFmode and XFmode */
1188   2,                                    /* cost of moving MMX register */
1189   {3, 3},                               /* cost of loading MMX registers
1190                                            in SImode and DImode */
1191   {4, 4},                               /* cost of storing MMX registers
1192                                            in SImode and DImode */
1193   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1194   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
1195                                            in 32,64,128,256 and 512-bit */
1196   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
1197                                            in 32,64,128,256 and 512-bit */
1198   3, 3,                         /* SSE->integer and integer->SSE moves */
1199   3, 3,                         /* mask->integer and integer->mask moves */
1200   {3, 4, 3},                            /* cost of loading mask register
1201                                            in QImode, HImode, SImode.  */
1202   {3, 4, 3},                            /* cost of storing mask register
1203                                            in QImode, HImode, SImode.  */
1204   2,                                    /* cost of moving mask register.  */
1205
1206                                         /* On K8:
1207                                             MOVD reg64, xmmreg Double FSTORE 4
1208                                             MOVD reg32, xmmreg Double FSTORE 4
1209                                            On AMDFAM10:
1210                                             MOVD reg64, xmmreg Double FADD 3
1211                                                                1/1  1/1
1212                                             MOVD reg32, xmmreg Double FADD 3
1213                                                                1/1  1/1 */
1214   /* End of register allocator costs.  */
1215   },
1216
1217   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1218   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1219   COSTS_N_INSNS (1),                    /* variable shift costs */
1220   COSTS_N_INSNS (1),                    /* constant shift costs */
1221   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1222    COSTS_N_INSNS (4),                   /*                               HI */
1223    COSTS_N_INSNS (3),                   /*                               SI */
1224    COSTS_N_INSNS (4),                   /*                               DI */
1225    COSTS_N_INSNS (5)},                  /*                            other */
1226   0,                                    /* cost of multiply per each bit set */
1227   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1228    COSTS_N_INSNS (35),                  /*                          HI */
1229    COSTS_N_INSNS (51),                  /*                          SI */
1230    COSTS_N_INSNS (83),                  /*                          DI */
1231    COSTS_N_INSNS (83)},                 /*                          other */
1232   COSTS_N_INSNS (1),                    /* cost of movsx */
1233   COSTS_N_INSNS (1),                    /* cost of movzx */
1234   8,                                    /* "large" insn */
1235   9,                                    /* MOVE_RATIO */
1236   6,                                    /* CLEAR_RATIO */
     /* The integer and aligned SSE load/store costs that follow repeat the
        values given in the register allocator block above.  */
1237   {3, 4, 3},                            /* cost of loading integer registers
1238                                            in QImode, HImode and SImode.
1239                                            Relative to reg-reg move (2).  */
1240   {3, 4, 3},                            /* cost of storing integer registers */
1241   {4, 4, 3, 6, 12},                     /* cost of loading SSE register
1242                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1243   {4, 4, 5, 10, 20},                    /* cost of storing SSE register
1244                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1245   {4, 4, 3, 7, 12},                     /* cost of unaligned loads.  */
1246   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
1247   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1248   3,                                    /* cost of moving SSE register to integer.  */
1249   4, 4,                                 /* Gather load static, per_elt.  */
1250   4, 4,                                 /* Gather store static, per_elt.  */
1251   64,                                   /* size of l1 cache.  */
1252   512,                                  /* size of l2 cache.  */
1253   64,                                   /* size of prefetch block */
1254   /* New AMD processors never drop prefetches; if they cannot be performed
1255      immediately, they are queued.  We set number of simultaneous prefetches
1256      to a large constant to reflect this (it probably is not a good idea not
1257      to limit number of prefetches at all, as their execution also takes some
1258      time).  */
1259   100,                                  /* number of parallel prefetches */
1260   2,                                    /* Branch cost */
1261   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1262   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1263   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1264   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1265   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1266   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1267
1268   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1269   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1270   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1271   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1272   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1273   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1274   /* 11-16  */
1275   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1276   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1277   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1278   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1279   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
     /* String operation strategy tables defined just before this table.  */
1280   amdfam10_memcpy,
1281   amdfam10_memset,
1282   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1283   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1284   "32:25:8",                            /* Loop alignment.  */
1285   "32:8:8",                             /* Jump alignment.  */
1286   "0:0:8",                              /* Label alignment.  */
1287   "32",                                 /* Func alignment.  */
1288   4,                                    /* Small unroll limit.  */
1289   2,                                    /* Small unroll factor.  */
1290   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
1291 };
1292
1293 /* BDVER has an optimized REP instruction for medium-sized blocks, but for
1294    very small blocks it is better to use a loop.  For large blocks, a libcall
1295    can use non-temporal accesses and beat inline code considerably.  */
1296 static stringop_algs bdver_memcpy[2] = {
     /* BDVER memcpy strategy table.  The values are the same as in
        k8_memcpy and amdfam10_memcpy.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, -1 marks the catch-all entry, and the two
        array slots are the 32-bit and 64-bit variants (only the second slot
        uses rep_prefix_8_byte) -- confirm against the stringop_algs
        definition.  */
1297   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1298              {-1, rep_prefix_4_byte, false}}},
1299   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1300              {-1, libcall, false}}}};
1301 static stringop_algs bdver_memset[2] = {
     /* BDVER memset strategy table.  The values are the same as in
        k8_memset and amdfam10_memset.
        NOTE(review): presumably each {N, alg, flag} triple selects ALG for
        blocks of at most N bytes, -1 marks the catch-all entry, and the two
        array slots are the 32-bit and 64-bit variants (only the second slot
        uses rep_prefix_8_byte) -- confirm against the stringop_algs
        definition.  */
1302   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1303              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1304   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1305              {-1, libcall, false}}}};
1306
1307 const struct processor_costs bdver_cost = {
1308   {
1309   /* Start of register allocator costs.  integer->integer move cost is 2. */
1310   8,                                 /* cost for loading QImode using movzbl */
1311   {8, 8, 8},                            /* cost of loading integer registers
1312                                            in QImode, HImode and SImode.
1313                                            Relative to reg-reg move (2).  */
1314   {8, 8, 8},                            /* cost of storing integer registers */
1315   4,                                    /* cost of reg,reg fld/fst */
1316   {12, 12, 28},                         /* cost of loading fp registers
1317                                            in SFmode, DFmode and XFmode */
1318   {10, 10, 18},                         /* cost of storing fp registers
1319                                            in SFmode, DFmode and XFmode */
1320   4,                                    /* cost of moving MMX register */
1321   {12, 12},                             /* cost of loading MMX registers
1322                                            in SImode and DImode */
1323   {10, 10},                             /* cost of storing MMX registers
1324                                            in SImode and DImode */
1325   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1326   {12, 12, 10, 40, 60},                 /* cost of loading SSE registers
1327                                            in 32,64,128,256 and 512-bit */
1328   {10, 10, 10, 40, 60},                 /* cost of storing SSE registers
1329                                            in 32,64,128,256 and 512-bit */
1330   16, 20,                               /* SSE->integer and integer->SSE moves */
1331   16, 20,                               /* mask->integer and integer->mask moves */
1332   {8, 8, 8},                            /* cost of loading mask register
1333                                            in QImode, HImode, SImode.  */
1334   {8, 8, 8},                            /* cost if storing mask register
1335                                            in QImode, HImode, SImode.  */
1336   2,                                    /* cost of moving mask register.  */
1337   /* End of register allocator costs.  */
1338   },
1339
1340   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1341   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1342   COSTS_N_INSNS (1),                    /* variable shift costs */
1343   COSTS_N_INSNS (1),                    /* constant shift costs */
1344   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1345    COSTS_N_INSNS (4),                   /*                               HI */
1346    COSTS_N_INSNS (4),                   /*                               SI */
1347    COSTS_N_INSNS (6),                   /*                               DI */
1348    COSTS_N_INSNS (6)},                  /*                            other */
1349   0,                                    /* cost of multiply per each bit set */
1350   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1351    COSTS_N_INSNS (35),                  /*                          HI */
1352    COSTS_N_INSNS (51),                  /*                          SI */
1353    COSTS_N_INSNS (83),                  /*                          DI */
1354    COSTS_N_INSNS (83)},                 /*                          other */
1355   COSTS_N_INSNS (1),                    /* cost of movsx */
1356   COSTS_N_INSNS (1),                    /* cost of movzx */
1357   8,                                    /* "large" insn */
1358   9,                                    /* MOVE_RATIO */
1359   6,                                    /* CLEAR_RATIO */
1360   {8, 8, 8},                            /* cost of loading integer registers
1361                                            in QImode, HImode and SImode.
1362                                            Relative to reg-reg move (2).  */
1363   {8, 8, 8},                            /* cost of storing integer registers */
1364   {12, 12, 10, 40, 60},                 /* cost of loading SSE register
1365                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1366   {10, 10, 10, 40, 60},                 /* cost of storing SSE register
1367                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1368   {12, 12, 10, 40, 60},                 /* cost of unaligned loads.  */
1369   {10, 10, 10, 40, 60},                 /* cost of unaligned stores.  */
1370   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1371   16,                                   /* cost of moving SSE register to integer.  */
1372   12, 12,                               /* Gather load static, per_elt.  */
1373   10, 10,                               /* Gather store static, per_elt.  */
1374   16,                                   /* size of l1 cache.  */
1375   2048,                                 /* size of l2 cache.  */
1376   64,                                   /* size of prefetch block */
1377   /* New AMD processors never drop prefetches; if they cannot be performed
1378      immediately, they are queued.  We set number of simultaneous prefetches
1379      to a large constant to reflect this (it probably is not a good idea not
1380      to limit number of prefetches at all, as their execution also takes some
1381      time).  */
1382   100,                                  /* number of parallel prefetches */
1383   2,                                    /* Branch cost */
1384   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1385   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1386   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1387   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1388   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1389   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1390
1391   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1392   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1393   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1394   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1395   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1396   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1397   /* 9-24  */
1398   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1399   /* 9-27  */
1400   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1401   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1402   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1403   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1404   bdver_memcpy,
1405   bdver_memset,
1406   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1407   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1408   "16:11:8",                            /* Loop alignment.  */
1409   "16:8:8",                             /* Jump alignment.  */
1410   "0:0:8",                              /* Label alignment.  */
1411   "11",                                 /* Func alignment.  */
1412   4,                                    /* Small unroll limit.  */
1413   2,                                    /* Small unroll factor.  */
1414   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
1415 };
1416
1417
1418 /*  ZNVER1 has an optimized REP instruction for medium sized blocks, but
1419     for very small blocks it is better to use a loop.  For large blocks, a
1420     libcall can do non-temporal accesses and beat inline code considerably.  */
1421 static stringop_algs znver1_memcpy[2] = { /* memcpy strategies for ZNVER1.  */
1422   /* 32-bit tuning.  No REP step: loop, unrolled loop, then libcall.  */
1423   {libcall, {{6, loop, false},
1424              {14, unrolled_loop, false},
1425              {-1, libcall, false}}},
1426   /* 64-bit tuning.  Loop, then REP with 8-byte prefix, then libcall.  */
1427   {libcall, {{16, loop, false},
1428              {128, rep_prefix_8_byte, false},
1429              {-1, libcall, false}}}};
1430 static stringop_algs znver1_memset[2] = { /* memset strategies for ZNVER1.  */
1431   /* 32-bit tuning.  Loop, unrolled loop, REP with 4-byte prefix, libcall.  */
1432   {libcall, {{8, loop, false},
1433              {24, unrolled_loop, false},
1434              {128, rep_prefix_4_byte, false},
1435              {-1, libcall, false}}},
1436   /* 64-bit tuning.  Unrolled loop, REP with 8-byte prefix, then libcall.  */
1437   {libcall, {{48, unrolled_loop, false},
1438              {128, rep_prefix_8_byte, false},
1439              {-1, libcall, false}}}};
1440 struct processor_costs znver1_cost = { /* Costs for tuning for ZNVER1.  */
1441   {
1442   /* Start of register allocator costs.  integer->integer move cost is 2.  */
1443
1444   /* reg-reg moves are done by renaming and thus they are even cheaper than
1445      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1446      to doubles of latencies, we do not model this correctly.  It does not
1447      seem to make practical difference to bump prices up even more.  */
1448   6,                                    /* cost for loading QImode using
1449                                            movzbl.  */
1450   {6, 6, 6},                            /* cost of loading integer registers
1451                                            in QImode, HImode and SImode.
1452                                            Relative to reg-reg move (2).  */
1453   {8, 8, 8},                            /* cost of storing integer
1454                                            registers.  */
1455   2,                                    /* cost of reg,reg fld/fst.  */
1456   {6, 6, 16},                           /* cost of loading fp registers
1457                                            in SFmode, DFmode and XFmode.  */
1458   {8, 8, 16},                           /* cost of storing fp registers
1459                                            in SFmode, DFmode and XFmode.  */
1460   2,                                    /* cost of moving MMX register.  */
1461   {6, 6},                               /* cost of loading MMX registers
1462                                            in SImode and DImode.  */
1463   {8, 8},                               /* cost of storing MMX registers
1464                                            in SImode and DImode.  */
1465   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1466   {6, 6, 6, 12, 24},                    /* cost of loading SSE registers
1467                                            in 32,64,128,256 and 512-bit.  */
1468   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1469                                            in 32,64,128,256 and 512-bit.  */
1470   6, 6,                         /* SSE->integer and integer->SSE moves.  */
1471   8, 8,                         /* mask->integer and integer->mask moves.  */
1472   {6, 6, 6},                            /* cost of loading mask register
1473                                            in QImode, HImode, SImode.  */
1474   {8, 8, 8},                            /* cost of storing mask register
1475                                            in QImode, HImode, SImode.  */
1476   2,                                    /* cost of moving mask register.  */
1477   /* End of register allocator costs.  */
1478   },
1479
1480   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1481   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1482   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1483   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1484   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1485    COSTS_N_INSNS (3),                   /*                               HI.  */
1486    COSTS_N_INSNS (3),                   /*                               SI.  */
1487    COSTS_N_INSNS (3),                   /*                               DI.  */
1488    COSTS_N_INSNS (3)},                  /*                            other.  */
1489   0,                                    /* cost of multiply per each bit
1490                                             set.  */
1491    /* Depending on parameters, idiv can get faster on ryzen.  This is an upper
1492       bound.  */
1493   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1494    COSTS_N_INSNS (22),                  /*                          HI.  */
1495    COSTS_N_INSNS (30),                  /*                          SI.  */
1496    COSTS_N_INSNS (45),                  /*                          DI.  */
1497    COSTS_N_INSNS (45)},                 /*                          other.  */
1498   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1499   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1500   8,                                    /* "large" insn.  */
1501   9,                                    /* MOVE_RATIO.  */
1502   6,                                    /* CLEAR_RATIO.  */
1503   {6, 6, 6},                            /* cost of loading integer registers
1504                                            in QImode, HImode and SImode.
1505                                            Relative to reg-reg move (2).  */
1506   {8, 8, 8},                            /* cost of storing integer
1507                                            registers.  */
1508   {6, 6, 6, 12, 24},                    /* cost of loading SSE register
1509                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
1510   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
1511                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
1512   {6, 6, 6, 12, 24},                    /* cost of unaligned loads.  */
1513   {8, 8, 8, 16, 32},                    /* cost of unaligned stores.  */
1514   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1515   6,                                    /* cost of moving SSE register to integer.  */
1516   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1517      throughput 12.  Approx 9 uops do not depend on vector size and every load
1518      is 7 uops.  */
1519   18, 8,                                /* Gather load static, per_elt.  */
1520   18, 10,                               /* Gather store static, per_elt.  */
1521   32,                                   /* size of l1 cache.  */
1522   512,                                  /* size of l2 cache.  */
1523   64,                                   /* size of prefetch block.  */
1524   /* New AMD processors never drop prefetches; if they cannot be performed
1525      immediately, they are queued.  We set number of simultaneous prefetches
1526      to a large constant to reflect this (it probably is not a good idea not
1527      to limit number of prefetches at all, as their execution also takes some
1528      time).  */
1529   100,                                  /* number of parallel prefetches.  */
1530   3,                                    /* Branch cost.  */
1531   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1532   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1533   /* Latency of fdiv is 8-15.  */
1534   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1535   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1536   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1537   /* Latency of fsqrt is 4-10.  */
1538   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1539
1540   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1541   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1542   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1543   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1544   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1545   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1546   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1547   /* 9-13.  */
1548   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1549   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1550   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1551   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1552      and it can execute 2 integer additions and 2 multiplications thus
1553      reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
1554      that 4 works better than 6 probably due to register pressure.
1555
1556      Integer vector operations are taken by FP unit and execute 3 vector
1557      plus/minus operations per cycle but only one multiply.  This is adjusted
1558      in ix86_reassociation_width.  */
1559   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1560   znver1_memcpy,                        /* memcpy strategy table defined above.  */
1561   znver1_memset,                        /* memset strategy table defined above.  */
1562   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1563   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1564   "16",                                 /* Loop alignment.  */
1565   "16",                                 /* Jump alignment.  */
1566   "0:0:8",                              /* Label alignment.  */
1567   "16",                                 /* Func alignment.  */
1568   4,                                    /* Small unroll limit.  */
1569   2,                                    /* Small unroll factor.  */
1570   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
1571 };
1572
1573 /*  ZNVER2 has an optimized REP instruction for medium sized blocks, but
1574     for very small blocks it is better to use a loop.  For large blocks, a
1575     libcall can do non-temporal accesses and beat inline code considerably.  */
1576 static stringop_algs znver2_memcpy[2] = { /* Also referenced by znver3_cost.  */
1577   /* 32-bit tuning.  No REP step: loop, unrolled loop, then libcall.  */
1578   {libcall, {{6, loop, false},
1579              {14, unrolled_loop, false},
1580              {-1, libcall, false}}},
1581   /* 64-bit tuning.  Loop, then REP with 4-byte prefix, then libcall.  */
1582   {libcall, {{16, loop, false},
1583              {64, rep_prefix_4_byte, false},
1584              {-1, libcall, false}}}};
1585 static stringop_algs znver2_memset[2] = { /* Also referenced by znver3_cost.  */
1586   /* 32-bit tuning.  Loop, unrolled loop, REP with 4-byte prefix, libcall.  */
1587   {libcall, {{8, loop, false},
1588              {24, unrolled_loop, false},
1589              {128, rep_prefix_4_byte, false},
1590              {-1, libcall, false}}},
1591   /* 64-bit tuning.  No loop step: REP with 4-byte, then 8-byte prefix, then
1592      libcall.  */
1592   {libcall, {{24, rep_prefix_4_byte, false},
1593              {128, rep_prefix_8_byte, false},
1594              {-1, libcall, false}}}};
1595
1596 struct processor_costs znver2_cost = { /* Costs for tuning for ZNVER2.  */
1597   {
1598   /* Start of register allocator costs.  integer->integer move cost is 2.  */
1599
1600   /* reg-reg moves are done by renaming and thus they are even cheaper than
1601      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1602      to doubles of latencies, we do not model this correctly.  It does not
1603      seem to make practical difference to bump prices up even more.  */
1604   6,                                    /* cost for loading QImode using
1605                                            movzbl.  */
1606   {6, 6, 6},                            /* cost of loading integer registers
1607                                            in QImode, HImode and SImode.
1608                                            Relative to reg-reg move (2).  */
1609   {8, 8, 8},                            /* cost of storing integer
1610                                            registers.  */
1611   2,                                    /* cost of reg,reg fld/fst.  */
1612   {6, 6, 16},                           /* cost of loading fp registers
1613                                            in SFmode, DFmode and XFmode.  */
1614   {8, 8, 16},                           /* cost of storing fp registers
1615                                            in SFmode, DFmode and XFmode.  */
1616   2,                                    /* cost of moving MMX register.  */
1617   {6, 6},                               /* cost of loading MMX registers
1618                                            in SImode and DImode.  */
1619   {8, 8},                               /* cost of storing MMX registers
1620                                            in SImode and DImode.  */
1621   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1622                                            register.  */
1623   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1624                                            in 32,64,128,256 and 512-bit.  */
1625   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1626                                            in 32,64,128,256 and 512-bit.  */
1627   6, 6,                                 /* SSE->integer and integer->SSE
1628                                            moves.  */
1629   8, 8,                         /* mask->integer and integer->mask moves.  */
1630   {6, 6, 6},                            /* cost of loading mask register
1631                                            in QImode, HImode, SImode.  */
1632   {8, 8, 8},                            /* cost of storing mask register
1633                                            in QImode, HImode, SImode.  */
1634   2,                                    /* cost of moving mask register.  */
1635   /* End of register allocator costs.  */
1636   },
1637
1638   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1639   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1640   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1641   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1642   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1643    COSTS_N_INSNS (3),                   /*                               HI.  */
1644    COSTS_N_INSNS (3),                   /*                               SI.  */
1645    COSTS_N_INSNS (3),                   /*                               DI.  */
1646    COSTS_N_INSNS (3)},                  /*                            other.  */
1647   0,                                    /* cost of multiply per each bit
1648                                            set.  */
1649    /* Depending on parameters, idiv can get faster on ryzen.  This is an upper
1650       bound.  */
1651   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1652    COSTS_N_INSNS (22),                  /*                          HI.  */
1653    COSTS_N_INSNS (30),                  /*                          SI.  */
1654    COSTS_N_INSNS (45),                  /*                          DI.  */
1655    COSTS_N_INSNS (45)},                 /*                          other.  */
1656   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1657   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1658   8,                                    /* "large" insn.  */
1659   9,                                    /* MOVE_RATIO.  */
1660   6,                                    /* CLEAR_RATIO.  */
1661   {6, 6, 6},                            /* cost of loading integer registers
1662                                            in QImode, HImode and SImode.
1663                                            Relative to reg-reg move (2).  */
1664   {8, 8, 8},                            /* cost of storing integer
1665                                            registers.  */
1666   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1667                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
1668   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1669                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
1670   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1671   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1672   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1673                                            register.  */
1674   6,                                    /* cost of moving SSE register to integer.  */
1675   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1676      throughput 12.  Approx 9 uops do not depend on vector size and every load
1677      is 7 uops.  */
1678   18, 8,                                /* Gather load static, per_elt.  */
1679   18, 10,                               /* Gather store static, per_elt.  */
1680   32,                                   /* size of l1 cache.  */
1681   512,                                  /* size of l2 cache.  */
1682   64,                                   /* size of prefetch block.  */
1683   /* New AMD processors never drop prefetches; if they cannot be performed
1684      immediately, they are queued.  We set number of simultaneous prefetches
1685      to a large constant to reflect this (it probably is not a good idea not
1686      to limit number of prefetches at all, as their execution also takes some
1687      time).  */
1688   100,                                  /* number of parallel prefetches.  */
1689   3,                                    /* Branch cost.  */
1690   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1691   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1692   /* Latency of fdiv is 8-15.  */
1693   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1694   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1695   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1696   /* Latency of fsqrt is 4-10.  */
1697   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1698
1699   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1700   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1701   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1702   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1703   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1704   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1705   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1706   /* 9-13.  */
1707   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1708   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1709   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1710   /* Zen can execute 4 integer operations per cycle.  FP operations
1711      take 3 cycles and it can execute 2 integer additions and 2
1712      multiplications thus reassociation may make sense up to width of 6.
1713      SPEC2k6 benchmarks suggest
1714      that 4 works better than 6 probably due to register pressure.
1715
1716      Integer vector operations are taken by FP unit and execute 3 vector
1717      plus/minus operations per cycle but only one multiply.  This is adjusted
1718      in ix86_reassociation_width.  */
1719   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1720   znver2_memcpy,                        /* memcpy strategy table defined above.  */
1721   znver2_memset,                        /* memset strategy table defined above.  */
1722   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1723   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1724   "16",                                 /* Loop alignment.  */
1725   "16",                                 /* Jump alignment.  */
1726   "0:0:8",                              /* Label alignment.  */
1727   "16",                                 /* Func alignment.  */
1728   4,                                    /* Small unroll limit.  */
1729   2,                                    /* Small unroll factor.  */
1730   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
1731 };
1732
1733 struct processor_costs znver3_cost = {
1734   {
1735   /* Start of register allocator costs.  integer->integer move cost is 2. */
1736
1737   /* reg-reg moves are done by renaming and thus they are even cheaper than
1738      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1739      to doubles of latencies, we do not model this correctly.  It does not
1740      seem to make practical difference to bump prices up even more.  */
1741   6,                                    /* cost for loading QImode using
1742                                            movzbl.  */
1743   {6, 6, 6},                            /* cost of loading integer registers
1744                                            in QImode, HImode and SImode.
1745                                            Relative to reg-reg move (2).  */
1746   {8, 8, 8},                            /* cost of storing integer
1747                                            registers.  */
1748   2,                                    /* cost of reg,reg fld/fst.  */
1749   {6, 6, 16},                           /* cost of loading fp registers
1750                                            in SFmode, DFmode and XFmode.  */
1751   {8, 8, 16},                           /* cost of storing fp registers
1752                                            in SFmode, DFmode and XFmode.  */
1753   2,                                    /* cost of moving MMX register.  */
1754   {6, 6},                               /* cost of loading MMX registers
1755                                            in SImode and DImode.  */
1756   {8, 8},                               /* cost of storing MMX registers
1757                                            in SImode and DImode.  */
1758   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1759                                            register.  */
1760   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1761                                            in 32,64,128,256 and 512-bit.  */
1762   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1763                                            in 32,64,128,256 and 512-bit.  */
1764   6, 6,                                 /* SSE->integer and integer->SSE
1765                                            moves.  */
1766   8, 8,                         /* mask->integer and integer->mask moves */
1767   {6, 6, 6},                            /* cost of loading mask register
1768                                            in QImode, HImode, SImode.  */
1769   {8, 8, 8},                            /* cost if storing mask register
1770                                            in QImode, HImode, SImode.  */
1771   2,                                    /* cost of moving mask register.  */
1772   /* End of register allocator costs.  */
1773   },
1774
1775   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1776   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1777   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1778   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1779   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1780    COSTS_N_INSNS (3),                   /*                               HI.  */
1781    COSTS_N_INSNS (3),                   /*                               SI.  */
1782    COSTS_N_INSNS (3),                   /*                               DI.  */
1783    COSTS_N_INSNS (3)},                  /*                      other.  */
1784   0,                                    /* cost of multiply per each bit
1785                                            set.  */
1786   {COSTS_N_INSNS (9),                   /* cost of a divide/mod for QI.  */
1787    COSTS_N_INSNS (10),                  /*                          HI.  */
1788    COSTS_N_INSNS (12),                  /*                          SI.  */
1789    COSTS_N_INSNS (17),                  /*                          DI.  */
1790    COSTS_N_INSNS (17)},                 /*                          other.  */
1791   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1792   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1793   8,                                    /* "large" insn.  */
1794   9,                                    /* MOVE_RATIO.  */
1795   6,                                    /* CLEAR_RATIO */
1796   {6, 6, 6},                            /* cost of loading integer registers
1797                                            in QImode, HImode and SImode.
1798                                            Relative to reg-reg move (2).  */
1799   {8, 8, 8},                            /* cost of storing integer
1800                                            registers.  */
1801   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1802                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1803   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1804                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1805   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1806   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1807   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1808                                            register.  */
1809   6,                                    /* cost of moving SSE register to integer.  */
1810   /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1811      throughput 9.  Approx 7 uops do not depend on vector size and every load
1812      is 4 uops.  */
1813   14, 8,                                /* Gather load static, per_elt.  */
1814   14, 10,                               /* Gather store static, per_elt.  */
1815   32,                                   /* size of l1 cache.  */
1816   512,                                  /* size of l2 cache.  */
1817   64,                                   /* size of prefetch block.  */
1818   /* New AMD processors never drop prefetches; if they cannot be performed
1819      immediately, they are queued.  We set number of simultaneous prefetches
1820      to a large constant to reflect this (it probably is not a good idea not
1821      to limit number of prefetches at all, as their execution also takes some
1822      time).  */
1823   100,                                  /* number of parallel prefetches.  */
1824   3,                                    /* Branch cost.  */
1825   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1826   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1827   /* Latency of fdiv is 8-15.  */
1828   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1829   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1830   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1831   /* Latency of fsqrt is 4-10.  */
1832   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1833
1834   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1835   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1836   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1837   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1838   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1839   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1840   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1841   /* 9-13.  */
1842   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1843   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1844   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1845   /* Zen can execute 4 integer operations per cycle.  FP operations
1846      take 3 cycles and it can execute 2 integer additions and 2
1847      multiplications thus reassociation may make sense up to width of 6.
1848      SPEC2k6 benchmarks suggest
1849      that 4 works better than 6 probably due to register pressure.
1850
1851      Integer vector operations are taken by FP unit and execute 3 vector
1852      plus/minus operations per cycle but only one multiply.  This is adjusted
1853      in ix86_reassociation_width.  */
1854   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1855   znver2_memcpy,
1856   znver2_memset,
1857   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1858   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1859   "16",                                 /* Loop alignment.  */
1860   "16",                                 /* Jump alignment.  */
1861   "0:0:8",                              /* Label alignment.  */
1862   "16",                                 /* Func alignment.  */
1863   4,                                    /* Small unroll limit.  */
1864   2,                                    /* Small unroll factor.  */
1865   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
1866 };
1867
1868 /* Costs for znver4.  Derived from znver3_cost; FP costs differ from the preceding table.  */
1869 struct processor_costs znver4_cost = {
1870   {
1871   /* Start of register allocator costs.  integer->integer move cost is 2. */
1872
1873   /* reg-reg moves are done by renaming and thus they are even cheaper than
1874      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1875      to doubles of latencies, we do not model this correctly.  It does not
1876      seem to make practical difference to bump prices up even more.  */
1877   6,                                    /* cost for loading QImode using
1878                                            movzbl.  */
1879   {6, 6, 6},                            /* cost of loading integer registers
1880                                            in QImode, HImode and SImode.
1881                                            Relative to reg-reg move (2).  */
1882   {8, 8, 8},                            /* cost of storing integer
1883                                            registers.  */
1884   2,                                    /* cost of reg,reg fld/fst.  */
1885   {14, 14, 17},                         /* cost of loading fp registers
1886                                            in SFmode, DFmode and XFmode.  */
1887   {12, 12, 16},                         /* cost of storing fp registers
1888                                            in SFmode, DFmode and XFmode.  */
1889   2,                                    /* cost of moving MMX register.  */
1890   {6, 6},                               /* cost of loading MMX registers
1891                                            in SImode and DImode.  */
1892   {8, 8},                               /* cost of storing MMX registers
1893                                            in SImode and DImode.  */
1894   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1895                                            register.  */
1896   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
1897                                            in 32,64,128,256 and 512-bit.  */
1898   {8, 8, 8, 12, 12},                    /* cost of storing SSE registers
1899                                            in 32,64,128,256 and 512-bit.  */
1900   6, 8,                                 /* SSE->integer and integer->SSE
1901                                            moves.  */
1902   8, 8,                                 /* mask->integer and integer->mask moves */
1903   {6, 6, 6},                            /* cost of loading mask register
1904                                            in QImode, HImode, SImode.  */
1905   {8, 8, 8},                            /* cost of storing mask register
1906                                            in QImode, HImode, SImode.  */
1907   2,                                    /* cost of moving mask register.  */
1908   /* End of register allocator costs.  */
1909   },
1910
1911   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1912   /* TODO: Lea with 3 components has cost 2.  */
1913   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1914   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1915   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1916   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1917    COSTS_N_INSNS (3),                   /*                               HI.  */
1918    COSTS_N_INSNS (3),                   /*                               SI.  */
1919    COSTS_N_INSNS (3),                   /*                               DI.  */
1920    COSTS_N_INSNS (3)},                  /*                      other.  */
1921   0,                                    /* cost of multiply per each bit
1922                                            set.  */
1923   {COSTS_N_INSNS (12),                  /* cost of a divide/mod for QI.  */
1924    COSTS_N_INSNS (13),                  /*                          HI.  */
1925    COSTS_N_INSNS (13),                  /*                          SI.  */
1926    COSTS_N_INSNS (18),                  /*                          DI.  */
1927    COSTS_N_INSNS (18)},                 /*                          other.  */
1928   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1929   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1930   8,                                    /* "large" insn.  */
1931   9,                                    /* MOVE_RATIO.  */
1932   6,                                    /* CLEAR_RATIO */
1933   {6, 6, 6},                            /* cost of loading integer registers
1934                                            in QImode, HImode and SImode.
1935                                            Relative to reg-reg move (2).  */
1936   {8, 8, 8},                            /* cost of storing integer
1937                                            registers.  */
1938   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
1939                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1940   {8, 8, 8, 12, 12},                    /* cost of storing SSE register
1941                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1942   {6, 6, 10, 10, 12},                   /* cost of unaligned loads.  */
1943   {8, 8, 8, 12, 12},                    /* cost of unaligned stores.  */
1944   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM
1945                                            register.  */
1946   6,                                    /* cost of moving SSE register to integer.  */
1947   /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1948      throughput 5.  Approx 7 uops do not depend on vector size and every load
1949      is 5 uops.  */
1950   14, 10,                               /* Gather load static, per_elt.  */
1951   14, 20,                               /* Gather store static, per_elt.  */
1952   32,                                   /* size of l1 cache.  */
1953   1024,                                 /* size of l2 cache.  */
1954   64,                                   /* size of prefetch block.  */
1955   /* New AMD processors never drop prefetches; if they cannot be performed
1956      immediately, they are queued.  We set number of simultaneous prefetches
1957      to a large constant to reflect this (it probably is not a good idea not
1958      to limit number of prefetches at all, as their execution also takes some
1959      time).  */
1960   100,                                  /* number of parallel prefetches.  */
1961   3,                                    /* Branch cost.  */
1962   COSTS_N_INSNS (7),                    /* cost of FADD and FSUB insns.  */
1963   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1964   /* Latency of fdiv is 8-15.  */
1965   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1966   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1967   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1968   /* Latency of fsqrt is 4-10 (NOTE(review): the cost below is 25 - confirm).  */
1969   COSTS_N_INSNS (25),                   /* cost of FSQRT instruction.  */
1970
1971   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1972   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1973   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1974   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1975   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
1976   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
1977   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1978   /* 9-13.  */
1979   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1980   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1981   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1982   /* Zen can execute 4 integer operations per cycle.  FP operations
1983      take 3 cycles and it can execute 2 integer additions and 2
1984      multiplications thus reassociation may make sense up to width of 6.
1985      SPEC2k6 benchmarks suggest
1986      that 4 works better than 6 probably due to register pressure.
1987
1988      Integer vector operations are taken by FP unit and execute 3 vector
1989      plus/minus operations per cycle but only one multiply.  This is adjusted
1990      in ix86_reassociation_width.  */
1991   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1992   znver2_memcpy,
1993   znver2_memset,
1994   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1995   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1996   "16",                                 /* Loop alignment.  */
1997   "16",                                 /* Jump alignment.  */
1998   "0:0:8",                              /* Label alignment.  */
1999   "16",                                 /* Func alignment.  */
2000   4,                                    /* Small unroll limit.  */
2001   2,                                    /* Small unroll factor.  */
2002   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
2003 };
2004
2005 /* Costs for znver5.  Derived from znver4_cost; divide, SSE FP and reassociation values differ.  */
2006 struct processor_costs znver5_cost = {
2007   {
2008   /* Start of register allocator costs.  integer->integer move cost is 2. */
2009
2010   /* reg-reg moves are done by renaming and thus they are even cheaper than
2011      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
2012      to doubles of latencies, we do not model this correctly.  It does not
2013      seem to make practical difference to bump prices up even more.  */
2014   6,                                    /* cost for loading QImode using
2015                                            movzbl.  */
2016   {6, 6, 6},                            /* cost of loading integer registers
2017                                            in QImode, HImode and SImode.
2018                                            Relative to reg-reg move (2).  */
2019   {8, 8, 8},                            /* cost of storing integer
2020                                            registers.  */
2021   2,                                    /* cost of reg,reg fld/fst.  */
2022   {14, 14, 17},                         /* cost of loading fp registers
2023                                            in SFmode, DFmode and XFmode.  */
2024   {12, 12, 16},                         /* cost of storing fp registers
2025                                            in SFmode, DFmode and XFmode.  */
2026   2,                                    /* cost of moving MMX register.  */
2027   {6, 6},                               /* cost of loading MMX registers
2028                                            in SImode and DImode.  */
2029   {8, 8},                               /* cost of storing MMX registers
2030                                            in SImode and DImode.  */
2031   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
2032                                            register.  */
2033   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
2034                                            in 32,64,128,256 and 512-bit.  */
2035   {8, 8, 8, 12, 12},                    /* cost of storing SSE registers
2036                                            in 32,64,128,256 and 512-bit.  */
2037   6, 8,                                 /* SSE->integer and integer->SSE
2038                                            moves.  */
2039   8, 8,                                 /* mask->integer and integer->mask moves */
2040   {6, 6, 6},                            /* cost of loading mask register
2041                                            in QImode, HImode, SImode.  */
2042   {8, 8, 8},                            /* cost of storing mask register
2043                                            in QImode, HImode, SImode.  */
2044   2,                                    /* cost of moving mask register.  */
2045   /* End of register allocator costs.  */
2046   },
2047
2048   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
2049   /* TODO: Lea with 3 components has cost 2.  */
2050   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
2051   COSTS_N_INSNS (1),                    /* variable shift costs.  */
2052   COSTS_N_INSNS (1),                    /* constant shift costs.  */
2053   /* mul has latency 3, executes in 3 integer units.  */
2054   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
2055    COSTS_N_INSNS (3),                   /*                               HI.  */
2056    COSTS_N_INSNS (3),                   /*                               SI.  */
2057    COSTS_N_INSNS (3),                   /*                               DI.  */
2058    COSTS_N_INSNS (3)},                  /*                      other.  */
2059   0,                                    /* cost of multiply per each bit
2060                                            set.  */
2061   /* integer divide has latency of 8 cycles
2062      plus 1 for every 9 bits of quotient.  */
2063   {COSTS_N_INSNS (10),                  /* cost of a divide/mod for QI.  */
2064    COSTS_N_INSNS (11),                  /*                          HI.  */
2065    COSTS_N_INSNS (13),                  /*                          SI.  */
2066    COSTS_N_INSNS (16),                  /*                          DI.  */
2067    COSTS_N_INSNS (16)},                 /*                          other.  */
2068   COSTS_N_INSNS (1),                    /* cost of movsx.  */
2069   COSTS_N_INSNS (1),                    /* cost of movzx.  */
2070   15,                                   /* "large" insn.  */
2071   9,                                    /* MOVE_RATIO.  */
2072   6,                                    /* CLEAR_RATIO */
2073   {6, 6, 6},                            /* cost of loading integer registers
2074                                            in QImode, HImode and SImode.
2075                                            Relative to reg-reg move (2).  */
2076   {8, 8, 8},                            /* cost of storing integer
2077                                            registers.  */
2078   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
2079                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2080   {8, 8, 8, 12, 12},                    /* cost of storing SSE register
2081                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2082   {6, 6, 10, 10, 12},                   /* cost of unaligned loads.  */
2083   {8, 8, 8, 12, 12},                    /* cost of unaligned stores.  */
2084   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM
2085                                            register.  */
2086   6,                                    /* cost of moving SSE register to integer.  */
2087
2088   /* TODO: gather and scatter instructions are currently disabled in
2089      x86-tune.def.  In some cases they are however a win, see PR116582.
2090      We however need a good cost model for them.  */
2091   14, 10,                               /* Gather load static, per_elt.  */
2092   14, 20,                               /* Gather store static, per_elt.  */
2093   48,                                   /* size of l1 cache.  */
2094   1024,                                 /* size of l2 cache.  */
2095   64,                                   /* size of prefetch block.  */
2096   /* New AMD processors never drop prefetches; if they cannot be performed
2097      immediately, they are queued.  We set number of simultaneous prefetches
2098      to a large constant to reflect this (it probably is not a good idea not
2099      to limit number of prefetches at all, as their execution also takes some
2100      time).  */
2101   100,                                  /* number of parallel prefetches.  */
2102   3,                                    /* Branch cost.  */
2103   /* TODO x87 latencies are still based on znver4.
2104      Probably not very important these days.  */
2105   COSTS_N_INSNS (7),                    /* cost of FADD and FSUB insns.  */
2106   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
2107   /* Latency of fdiv is 8-15.  */
2108   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
2109   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2110   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2111   /* Latency of fsqrt is 4-10 (NOTE(review): the cost below is 25 - confirm).  */
2112   COSTS_N_INSNS (25),                   /* cost of FSQRT instruction.  */
2113
2114   /* SSE instructions have typical throughput 4 and latency 1.  */
2115   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2116   /* ADDSS has throughput 2 and latency 2
2117      (in some cases when source is another addition).  */
2118   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2119   /* MULSS has throughput 2 and latency 3.  */
2120   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
2121   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
2122   /* FMA has throughput 2 and latency 4.  */
2123   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2124   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2125   /* DIVSS has throughput 0.4 and latency 10.  */
2126   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
2127   /* DIVSD has throughput 0.25 and latency 13.  */
2128   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
2129   /* SQRTSS has throughput 0.22 and latency 14.  */
2130   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2131   /* SQRTSD has throughput 0.13 and latency 20.  */
2132   COSTS_N_INSNS (20),                   /* cost of SQRTSD instruction.  */
2133   /* Zen5 can execute:
2134       - integer ops: 6 per cycle, at most 3 multiplications.
2135         latency 1 for additions, 3 for multiplications (pipelined)
2136
2137         Setting width of 9 for multiplication is probably excessive
2138         for register pressure.
2139       - fp ops: 2 additions per cycle, latency 2-3
2140                 2 multiplications per cycle, latency 3
2141       - vector integer ops: 4 additions, latency 1
2142                            2 multiplications, latency 4
2143         We increase width to 6 for multiplications
2144         in ix86_reassociation_width.  */
2145   6, 6, 4, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
2146   znver2_memcpy,
2147   znver2_memset,
2148   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2149   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2150   "16",                                 /* Loop alignment.  */
2151   "16",                                 /* Jump alignment.  */
2152   "0:0:8",                              /* Label alignment.  */
2153   "16",                                 /* Func alignment.  */
2154   4,                                    /* Small unroll limit.  */
2155   2,                                    /* Small unroll factor.  */
2156   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
2157 };
2158
2159 /* skylake_cost should produce code tuned for Skylake family of CPUs.  */
2160 static stringop_algs skylake_memcpy[2] =   {  /* memcpy strategies used by skylake_cost.  */
2161   {libcall,
2162    {{256, rep_prefix_1_byte, true},
2163     {256, loop, false},
2164     {-1, libcall, false}}},
2165   {libcall,
2166    {{256, rep_prefix_1_byte, true},
2167     {256, loop, false},
2168     {-1, libcall, false}}}};
2169
2170 static stringop_algs skylake_memset[2] = {  /* memset strategies used by skylake_cost.  */
2171   {libcall,
2172    {{256, rep_prefix_1_byte, true},
2173     {256, loop, false},
2174     {-1, libcall, false}}},
2175   {libcall,
2176    {{256, rep_prefix_1_byte, true},
2177     {256, loop, false},
2178     {-1, libcall, false}}}};
2179
2180 static const
2181 struct processor_costs skylake_cost = {  /* costs for tuning for Skylake */
2182   {
2183   /* Start of register allocator costs.  integer->integer move cost is 2. */
2184   6,                                 /* cost for loading QImode using movzbl */
2185   {4, 4, 4},                            /* cost of loading integer registers
2186                                            in QImode, HImode and SImode.
2187                                            Relative to reg-reg move (2).  */
2188   {6, 6, 6},                            /* cost of storing integer registers */
2189   2,                                    /* cost of reg,reg fld/fst */
2190   {6, 6, 8},                            /* cost of loading fp registers
2191                                            in SFmode, DFmode and XFmode */
2192   {6, 6, 10},                           /* cost of storing fp registers
2193                                            in SFmode, DFmode and XFmode */
2194   2,                                    /* cost of moving MMX register */
2195   {6, 6},                               /* cost of loading MMX registers
2196                                            in SImode and DImode */
2197   {6, 6},                               /* cost of storing MMX registers
2198                                            in SImode and DImode */
2199   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2200   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
2201                                            in 32,64,128,256 and 512-bit */
2202   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
2203                                            in 32,64,128,256 and 512-bit */
2204   6, 6,                         /* SSE->integer and integer->SSE moves */
2205   6, 6,                         /* mask->integer and integer->mask moves */
2206   {8, 8, 8},                            /* cost of loading mask register
2207                                            in QImode, HImode, SImode.  */
2208   {6, 6, 6},                            /* cost of storing mask register
2209                                            in QImode, HImode, SImode.  */
2210   3,                                    /* cost of moving mask register.  */
2211   /* End of register allocator costs.  */
2212   },
2213
2214   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2215   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction (one unit above an add) */
2216   COSTS_N_INSNS (1),                    /* variable shift costs */
2217   COSTS_N_INSNS (1),                    /* constant shift costs */
2218   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2219    COSTS_N_INSNS (3),                   /*                               HI */
2220    COSTS_N_INSNS (3),                   /*                               SI */
2221    COSTS_N_INSNS (3),                   /*                               DI */
2222    COSTS_N_INSNS (3)},                  /*                            other */
2223   0,                                    /* cost of multiply per each bit set */
2224   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2225      model is not realistic. We compensate by increasing the latencies a bit.  */
2226   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2227    COSTS_N_INSNS (11),                  /*                          HI */
2228    COSTS_N_INSNS (14),                  /*                          SI */
2229    COSTS_N_INSNS (76),                  /*                          DI */
2230    COSTS_N_INSNS (76)},                 /*                          other */
2231   COSTS_N_INSNS (1),                    /* cost of movsx */
2232   COSTS_N_INSNS (0),                    /* cost of movzx */
2233   8,                                    /* "large" insn */
2234   17,                                   /* MOVE_RATIO */
2235   17,                                   /* CLEAR_RATIO */
2236   {6, 6, 6},                            /* cost of loading integer registers
2237                                            in QImode, HImode and SImode.
2238                                            Relative to reg-reg move (2).  */
2239   {8, 8, 8},                            /* cost of storing integer registers */
2240   {8, 8, 8, 8, 16},                     /* cost of loading SSE register
2241                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2242   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
2243                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2244   {8, 8, 8, 8, 16},                     /* cost of unaligned loads.  */
2245   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
2246   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2247   6,                                    /* cost of moving SSE register to integer.  */
2248   20, 8,                                /* Gather load static, per_elt.  */
2249   22, 10,                               /* Gather store static, per_elt.  */
2250   64,                                   /* size of l1 cache.  */
2251   512,                                  /* size of l2 cache.  */
2252   64,                                   /* size of prefetch block */
2253   6,                                    /* number of parallel prefetches */
2254   3,                                    /* Branch cost */
2255   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2256   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2257   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2258   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2259   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2260   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
2261
2262   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2263   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2264   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2265   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2266   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2267   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2268   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
2269   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
2270   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
2271   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2272   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2273   skylake_memcpy,
2274   skylake_memset,
2275   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2276   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2277   "16:11:8",                            /* Loop alignment.  */
2278   "16:11:8",                            /* Jump alignment.  */
2279   "0:0:8",                              /* Label alignment.  */
2280   "16",                                 /* Func alignment.  */
2281   4,                                    /* Small unroll limit.  */
2282   2,                                    /* Small unroll factor.  */
2283   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
2284 };
2285
2286 /* icelake_cost should produce code tuned for Icelake family of CPUs.
2287    NB: rep_prefix_1_byte is used only for known size. */
2288
2289 static stringop_algs icelake_memcpy[2] =   {  /* same contents as skylake_memcpy.  */
2290   {libcall,
2291    {{256, rep_prefix_1_byte, true},
2292     {256, loop, false},
2293     {-1, libcall, false}}},
2294   {libcall,
2295    {{256, rep_prefix_1_byte, true},
2296     {256, loop, false},
2297     {-1, libcall, false}}}};
2298
2299 static stringop_algs icelake_memset[2] = {  /* same contents as skylake_memset.  */
2300   {libcall,
2301    {{256, rep_prefix_1_byte, true},
2302     {256, loop, false},
2303     {-1, libcall, false}}},
2304   {libcall,
2305    {{256, rep_prefix_1_byte, true},
2306     {256, loop, false},
2307     {-1, libcall, false}}}};
2308
/* Cost table named for Ice Lake (NOTE(review): presumably the table used
   when tuning for Ice Lake CPUs -- confirm where icelake_cost is selected).
   The initializer is positional, so every entry must stay in the field
   order of struct processor_costs, which is declared outside this section.
   The leading braced group holds the register allocator costs; the entries
   after it cover instruction costs, memory moves, cache parameters, branch
   costs, x87/SSE costs, the string-operation strategy tables
   (icelake_memcpy and icelake_memset) and code alignment.  */
2309 static const
2310 struct processor_costs icelake_cost = {
2311   {
2312   /* Start of register allocator costs.  integer->integer move cost is 2. */
2313   6,                                 /* cost for loading QImode using movzbl */
2314   {4, 4, 4},                            /* cost of loading integer registers
2315                                            in QImode, HImode and SImode.
2316                                            Relative to reg-reg move (2).  */
2317   {6, 6, 6},                            /* cost of storing integer registers */
2318   2,                                    /* cost of reg,reg fld/fst */
2319   {6, 6, 8},                            /* cost of loading fp registers
2320                                            in SFmode, DFmode and XFmode */
2321   {6, 6, 10},                           /* cost of storing fp registers
2322                                            in SFmode, DFmode and XFmode */
2323   2,                                    /* cost of moving MMX register */
2324   {6, 6},                               /* cost of loading MMX registers
2325                                            in SImode and DImode */
2326   {6, 6},                               /* cost of storing MMX registers
2327                                            in SImode and DImode */
2328   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2329   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
2330                                            in 32,64,128,256 and 512-bit */
2331   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
2332                                            in 32,64,128,256 and 512-bit */
2333   6, 6,                         /* SSE->integer and integer->SSE moves */
2334   6, 6,                         /* mask->integer and integer->mask moves */
2335   {8, 8, 8},                            /* cost of loading mask register
2336                                            in QImode, HImode, SImode.  */
2337   {6, 6, 6},                            /* cost of storing mask register
2338                                            in QImode, HImode, SImode.  */
2339   3,                                    /* cost of moving mask register.  */
2340   /* End of register allocator costs.  */
2341   },
2342
2343   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2344   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction */
2345   COSTS_N_INSNS (1),                    /* variable shift costs */
2346   COSTS_N_INSNS (1),                    /* constant shift costs */
2347   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2348    COSTS_N_INSNS (3),                   /*                               HI */
2349    COSTS_N_INSNS (3),                   /*                               SI */
2350    COSTS_N_INSNS (3),                   /*                               DI */
2351    COSTS_N_INSNS (3)},                  /*                            other */
2352   0,                                    /* cost of multiply per each bit set */
2353   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2354      model is not realistic. We compensate by increasing the latencies a bit.  */
2355   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2356    COSTS_N_INSNS (11),                  /*                          HI */
2357    COSTS_N_INSNS (14),                  /*                          SI */
2358    COSTS_N_INSNS (76),                  /*                          DI */
2359    COSTS_N_INSNS (76)},                 /*                          other */
2360   COSTS_N_INSNS (1),                    /* cost of movsx */
2361   COSTS_N_INSNS (0),                    /* cost of movzx */
2362   8,                                    /* "large" insn */
2363   17,                                   /* MOVE_RATIO */
2364   17,                                   /* CLEAR_RATIO */
2365   {6, 6, 6},                            /* cost of loading integer registers
2366                                            in QImode, HImode and SImode.
2367                                            Relative to reg-reg move (2).
2368                                            (Separate from the register
                                           allocator entry above.)  */
2368   {8, 8, 8},                            /* cost of storing integer registers */
2369   {8, 8, 8, 8, 16},                     /* cost of loading SSE register
2370                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2371   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
2372                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2373   {8, 8, 8, 8, 16},                     /* cost of unaligned loads.  */
2374   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
2375   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2376   6,                                    /* cost of moving SSE register to integer.  */
2377   20, 8,                                /* Gather load static, per_elt.  */
2378   22, 10,                               /* Gather store static, per_elt.  */
2379   64,                                   /* size of l1 cache.  */
2380   512,                                  /* size of l2 cache.  */
2381   64,                                   /* size of prefetch block */
2382   6,                                    /* number of parallel prefetches */
2383   3,                                    /* Branch cost */
2384   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2385   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2386   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2387   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2388   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2389   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
2390
2391   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2392   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2393   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2394   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2395   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2396   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2397   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
2398   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
2399   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
2400   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2401   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2402   icelake_memcpy,
2403   icelake_memset,
2404   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2405   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2406   "16:11:8",                            /* Loop alignment.  */
2407   "16:11:8",                            /* Jump alignment.  */
2408   "0:0:8",                              /* Label alignment.  */
2409   "16",                                 /* Func alignment.  */
2410   4,                                    /* Small unroll limit.  */
2411   2,                                    /* Small unroll factor.  */
2412   COSTS_N_INSNS (2) + 3,                /* Branch mispredict scale.  */
2413 };
2414
2415 /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
/* memcpy strategy table referenced by alderlake_cost.  Each of the two
   elements starts with an algorithm (libcall here) followed by
   {size, algorithm, flag} entries that end with a -1 entry.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each entry applies to blocks of up to SIZE bytes with -1 as the
   catch-all -- confirm against the stringop_algs declaration.  Under that
   reading {256, loop, false} repeats the threshold of the entry before it
   and would never be selected -- confirm this is intended.  */
2416 static stringop_algs alderlake_memcpy[2] = {
2417   {libcall,
2418    {{256, rep_prefix_1_byte, true},
2419     {256, loop, false},
2420     {-1, libcall, false}}},
2421   {libcall,
2422    {{256, rep_prefix_1_byte, true},
2423     {256, loop, false},
2424     {-1, libcall, false}}}};
/* memset strategy table referenced by alderlake_cost.  Both elements hold
   the same entries: an algorithm (libcall) followed by
   {size, algorithm, flag} entries that end with a -1 entry.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each entry applies to blocks of up to SIZE bytes with -1 as the
   catch-all -- confirm against the stringop_algs declaration.  Under that
   reading {256, loop, false} repeats the threshold of the entry before it
   and would never be selected -- confirm this is intended.  */
2425 static stringop_algs alderlake_memset[2] = {
2426   {libcall,
2427    {{256, rep_prefix_1_byte, true},
2428     {256, loop, false},
2429     {-1, libcall, false}}},
2430   {libcall,
2431    {{256, rep_prefix_1_byte, true},
2432     {256, loop, false},
2433     {-1, libcall, false}}}};
/* Cost table named for Alder Lake (NOTE(review): presumably the table used
   when tuning for the Alder Lake family -- confirm where alderlake_cost is
   selected).  The initializer is positional, so every entry must stay in
   the field order of struct processor_costs, which is declared outside this
   section.  The leading braced group holds the register allocator costs;
   the entries after it cover instruction costs, memory moves, cache
   parameters, branch costs, x87/SSE costs, the string-operation strategy
   tables (alderlake_memcpy and alderlake_memset) and code alignment.  */
2434 static const
2435 struct processor_costs alderlake_cost = {
2436   {
2437   /* Start of register allocator costs.  integer->integer move cost is 2.  */
2438   6,                                 /* cost for loading QImode using movzbl */
2439   {6, 6, 6},                            /* cost of loading integer registers
2440                                            in QImode, HImode and SImode.
2441                                            Relative to reg-reg move (2).  */
2442   {6, 6, 6},                            /* cost of storing integer registers */
2443   4,                                    /* cost of reg,reg fld/fst */
2444   {6, 6, 12},                           /* cost of loading fp registers
2445                                            in SFmode, DFmode and XFmode */
2446   {6, 6, 12},                           /* cost of storing fp registers
2447                                            in SFmode, DFmode and XFmode */
2448   2,                                    /* cost of moving MMX register */
2449   {6, 6},                               /* cost of loading MMX registers
2450                                            in SImode and DImode */
2451   {6, 6},                               /* cost of storing MMX registers
2452                                            in SImode and DImode */
2453   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2454   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2455                                            in 32,64,128,256 and 512-bit */
2456   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2457                                            in 32,64,128,256 and 512-bit */
2458   6, 6,                         /* SSE->integer and integer->SSE moves */
2459   6, 6,                         /* mask->integer and integer->mask moves */
2460   {6, 6, 6},                            /* cost of loading mask register
2461                                            in QImode, HImode, SImode.  */
2462   {6, 6, 6},                    /* cost of storing mask register
2463                                            in QImode, HImode, SImode.  */
2464   2,                                    /* cost of moving mask register.  */
2465   /* End of register allocator costs.  */
2466   },
2467
2468   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2469   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2470   COSTS_N_INSNS (1),                    /* variable shift costs */
2471   COSTS_N_INSNS (1),                    /* constant shift costs */
2472   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2473    COSTS_N_INSNS (3),                   /*                               HI */
2474    COSTS_N_INSNS (3),                   /*                               SI */
2475    COSTS_N_INSNS (3),                   /*                               DI */
2476    COSTS_N_INSNS (4)},                  /*                            other */
2477   0,                                    /* cost of multiply per each bit set */
2478   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
2479    COSTS_N_INSNS (22),                  /*                          HI */
2480    COSTS_N_INSNS (30),                  /*                          SI */
2481    COSTS_N_INSNS (74),                  /*                          DI */
2482    COSTS_N_INSNS (74)},                 /*                          other */
2483   COSTS_N_INSNS (1),                    /* cost of movsx */
2484   COSTS_N_INSNS (1),                    /* cost of movzx */
2485   8,                                    /* "large" insn */
2486   17,                                   /* MOVE_RATIO */
2487   17,                                   /* CLEAR_RATIO */
2488   {6, 6, 6},                            /* cost of loading integer registers
2489                                            in QImode, HImode and SImode.
2490                                            Relative to reg-reg move (2).  */
2491   {8, 8, 8},                            /* cost of storing integer registers */
2492   {8, 8, 8, 10, 15},                    /* cost of loading SSE register
2493                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2494   {8, 8, 8, 10, 15},                    /* cost of storing SSE register
2495                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2496   {8, 8, 8, 10, 15},                    /* cost of unaligned loads.  */
2497   {8, 8, 8, 10, 15},                    /* cost of unaligned stores.  */
2498   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2499   6,                                    /* cost of moving SSE register to integer.  */
2500   18, 6,                                /* Gather load static, per_elt.  */
2501   18, 6,                                /* Gather store static, per_elt.  */
2502   32,                                   /* size of l1 cache.  */
2503   512,                                  /* size of l2 cache.  */
2504   64,                                   /* size of prefetch block */
2505   6,                                    /* number of parallel prefetches */
2506   3,                                    /* Branch cost */
2507   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2508   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2509   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
2510   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2511   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2512   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
2513
2514   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2515   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2516   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2517   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2518   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2519   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2520   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2521   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2522   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2523   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2524   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
2525   alderlake_memcpy,
2526   alderlake_memset,
2527   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2528   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2529   "16:11:8",                            /* Loop alignment.  */
2530   "16:11:8",                            /* Jump alignment.  */
2531   "0:0:8",                              /* Label alignment.  */
2532   "16",                                 /* Func alignment.  */
2533   4,                                    /* Small unroll limit.  */
2534   2,                                    /* Small unroll factor.  */
2535   COSTS_N_INSNS (2) + 3,                /* Branch mispredict scale.  */
2536 };
2537
2538   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
2539      very small blocks it is better to use a loop.  For large blocks, libcall
2540      can do non-temporal accesses and beat inline code considerably.  */
/* memcpy strategy table referenced by btver1_cost.  Element 0 lists loop,
   unrolled_loop and a rep_prefix_4_byte catch-all; element 1 lists loop,
   rep_prefix_8_byte and a libcall catch-all.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each {size, algorithm, flag} entry applies to blocks of up to
   SIZE bytes with -1 meaning "any size" -- confirm against the
   stringop_algs declaration.  */
2541 static stringop_algs btver1_memcpy[2] = {
2542   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2543              {-1, rep_prefix_4_byte, false}}},
2544   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2545              {-1, libcall, false}}}};
/* memset strategy table referenced by btver1_cost.  Element 0 lists loop,
   unrolled_loop, rep_prefix_4_byte and a libcall catch-all; element 1 lists
   unrolled_loop, rep_prefix_8_byte and a libcall catch-all.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each {size, algorithm, flag} entry applies to blocks of up to
   SIZE bytes with -1 meaning "any size" -- confirm against the
   stringop_algs declaration.  */
2546 static stringop_algs btver1_memset[2] = {
2547   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2548              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2549   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2550              {-1, libcall, false}}}};
/* Cost table named for BTVER1 (NOTE(review): presumably the table used when
   tuning for btver1 -- confirm where btver1_cost is selected).  The
   initializer is positional, so every entry must stay in the field order of
   struct processor_costs, which is declared outside this section.  The
   leading braced group holds the register allocator costs; the entries
   after it cover instruction costs, memory moves, cache parameters, branch
   costs, x87/SSE costs, the string-operation strategy tables (btver1_memcpy
   and btver1_memset) and code alignment.  Unlike some neighbouring tables
   this one is declared without `static'.  */
2551 const struct processor_costs btver1_cost = {
2552   {
2553   /* Start of register allocator costs.  integer->integer move cost is 2. */
2554   8,                                 /* cost for loading QImode using movzbl */
2555   {6, 8, 6},                            /* cost of loading integer registers
2556                                            in QImode, HImode and SImode.
2557                                            Relative to reg-reg move (2).  */
2558   {6, 8, 6},                            /* cost of storing integer registers */
2559   4,                                    /* cost of reg,reg fld/fst */
2560   {12, 12, 28},                         /* cost of loading fp registers
2561                                            in SFmode, DFmode and XFmode */
2562   {12, 12, 38},                         /* cost of storing fp registers
2563                                            in SFmode, DFmode and XFmode */
2564   4,                                    /* cost of moving MMX register */
2565   {10, 10},                             /* cost of loading MMX registers
2566                                            in SImode and DImode */
2567   {12, 12},                             /* cost of storing MMX registers
2568                                            in SImode and DImode */
2569   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2570   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
2571                                            in 32,64,128,256 and 512-bit */
2572   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
2573                                            in 32,64,128,256 and 512-bit */
2574   14, 14,                               /* SSE->integer and integer->SSE moves */
2575   14, 14,                               /* mask->integer and integer->mask moves */
2576   {6, 8, 6},                            /* cost of loading mask register
2577                                            in QImode, HImode, SImode.  */
2578   {6, 8, 6},                            /* cost of storing mask register
2579                                            in QImode, HImode, SImode.  */
2580   2,                                    /* cost of moving mask register.  */
2581   /* End of register allocator costs.  */
2582   },
2583
2584   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2585   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
2586   COSTS_N_INSNS (1),                    /* variable shift costs */
2587   COSTS_N_INSNS (1),                    /* constant shift costs */
2588   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2589    COSTS_N_INSNS (4),                   /*                               HI */
2590    COSTS_N_INSNS (3),                   /*                               SI */
2591    COSTS_N_INSNS (4),                   /*                               DI */
2592    COSTS_N_INSNS (5)},                  /*                            other */
2593   0,                                    /* cost of multiply per each bit set */
2594   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
2595    COSTS_N_INSNS (35),                  /*                          HI */
2596    COSTS_N_INSNS (51),                  /*                          SI */
2597    COSTS_N_INSNS (83),                  /*                          DI */
2598    COSTS_N_INSNS (83)},                 /*                          other */
2599   COSTS_N_INSNS (1),                    /* cost of movsx */
2600   COSTS_N_INSNS (1),                    /* cost of movzx */
2601   8,                                    /* "large" insn */
2602   9,                                    /* MOVE_RATIO */
2603   6,                                    /* CLEAR_RATIO */
2604   {6, 8, 6},                            /* cost of loading integer registers
2605                                            in QImode, HImode and SImode.
2606                                            Relative to reg-reg move (2).  */
2607   {6, 8, 6},                            /* cost of storing integer registers */
2608   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
2609                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2610   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
2611                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2612   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
2613   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
2614   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2615   14,                                   /* cost of moving SSE register to integer.  */
2616   10, 10,                               /* Gather load static, per_elt.  */
2617   10, 10,                               /* Gather store static, per_elt.  */
2618   32,                                   /* size of l1 cache.  */
2619   512,                                  /* size of l2 cache.  */
2620   64,                                   /* size of prefetch block */
2621   100,                                  /* number of parallel prefetches */
2622   2,                                    /* Branch cost */
2623   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
2624   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2625   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
2626   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2627   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2628   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
2629
2630   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2631   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2632   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
2633   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2634   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2635   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2636   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2637   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2638   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2639   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
2640   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2641   btver1_memcpy,
2642   btver1_memset,
2643   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
2644   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2645   "16:11:8",                            /* Loop alignment.  */
2646   "16:8:8",                             /* Jump alignment.  */
2647   "0:0:8",                              /* Label alignment.  */
2648   "11",                                 /* Func alignment.  */
2649   4,                                    /* Small unroll limit.  */
2650   2,                                    /* Small unroll factor.  */
2651   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
2652 };
2653
/* memcpy strategy table referenced by btver2_cost.  Element 0 lists loop,
   unrolled_loop and a rep_prefix_4_byte catch-all; element 1 lists loop,
   rep_prefix_8_byte and a libcall catch-all.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each {size, algorithm, flag} entry applies to blocks of up to
   SIZE bytes with -1 meaning "any size" -- confirm against the
   stringop_algs declaration.  */
2654 static stringop_algs btver2_memcpy[2] = {
2655   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2656              {-1, rep_prefix_4_byte, false}}},
2657   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2658              {-1, libcall, false}}}};
/* memset strategy table referenced by btver2_cost.  Element 0 lists loop,
   unrolled_loop, rep_prefix_4_byte and a libcall catch-all; element 1 lists
   unrolled_loop, rep_prefix_8_byte and a libcall catch-all.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each {size, algorithm, flag} entry applies to blocks of up to
   SIZE bytes with -1 meaning "any size" -- confirm against the
   stringop_algs declaration.  */
2659 static stringop_algs btver2_memset[2] = {
2660   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2661              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2662   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2663              {-1, libcall, false}}}};
/* Cost table named for BTVER2 (NOTE(review): presumably the table used when
   tuning for btver2 -- confirm where btver2_cost is selected).  The
   initializer is positional, so every entry must stay in the field order of
   struct processor_costs, which is declared outside this section.  The
   leading braced group holds the register allocator costs; the entries
   after it cover instruction costs, memory moves, cache parameters, branch
   costs, x87/SSE costs, the string-operation strategy tables (btver2_memcpy
   and btver2_memset) and code alignment.  Unlike some neighbouring tables
   this one is declared without `static'.  */
2664 const struct processor_costs btver2_cost = {
2665   {
2666   /* Start of register allocator costs.  integer->integer move cost is 2. */
2667   8,                                 /* cost for loading QImode using movzbl */
2668   {8, 8, 6},                            /* cost of loading integer registers
2669                                            in QImode, HImode and SImode.
2670                                            Relative to reg-reg move (2).  */
2671   {8, 8, 6},                            /* cost of storing integer registers */
2672   4,                                    /* cost of reg,reg fld/fst */
2673   {12, 12, 28},                         /* cost of loading fp registers
2674                                            in SFmode, DFmode and XFmode */
2675   {12, 12, 38},                         /* cost of storing fp registers
2676                                            in SFmode, DFmode and XFmode */
2677   4,                                    /* cost of moving MMX register */
2678   {10, 10},                             /* cost of loading MMX registers
2679                                            in SImode and DImode */
2680   {12, 12},                             /* cost of storing MMX registers
2681                                            in SImode and DImode */
2682   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2683   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
2684                                            in 32,64,128,256 and 512-bit */
2685   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
2686                                            in 32,64,128,256 and 512-bit */
2687   14, 14,                               /* SSE->integer and integer->SSE moves */
2688   14, 14,                               /* mask->integer and integer->mask moves */
2689   {8, 8, 6},                            /* cost of loading mask register
2690                                            in QImode, HImode, SImode.  */
2691   {8, 8, 6},                            /* cost of storing mask register
2692                                            in QImode, HImode, SImode.  */
2693   2,                                    /* cost of moving mask register.  */
2694   /* End of register allocator costs.  */
2695   },
2696
2697   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2698   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
2699   COSTS_N_INSNS (1),                    /* variable shift costs */
2700   COSTS_N_INSNS (1),                    /* constant shift costs */
2701   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2702    COSTS_N_INSNS (4),                   /*                               HI */
2703    COSTS_N_INSNS (3),                   /*                               SI */
2704    COSTS_N_INSNS (4),                   /*                               DI */
2705    COSTS_N_INSNS (5)},                  /*                            other */
2706   0,                                    /* cost of multiply per each bit set */
2707   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
2708    COSTS_N_INSNS (35),                  /*                          HI */
2709    COSTS_N_INSNS (51),                  /*                          SI */
2710    COSTS_N_INSNS (83),                  /*                          DI */
2711    COSTS_N_INSNS (83)},                 /*                          other */
2712   COSTS_N_INSNS (1),                    /* cost of movsx */
2713   COSTS_N_INSNS (1),                    /* cost of movzx */
2714   8,                                    /* "large" insn */
2715   9,                                    /* MOVE_RATIO */
2716   6,                                    /* CLEAR_RATIO */
2717   {8, 8, 6},                            /* cost of loading integer registers
2718                                            in QImode, HImode and SImode.
2719                                            Relative to reg-reg move (2).  */
2720   {8, 8, 6},                            /* cost of storing integer registers */
2721   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
2722                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2723   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
2724                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2725   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
2726   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
2727   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2728   14,                                   /* cost of moving SSE register to integer.  */
2729   10, 10,                               /* Gather load static, per_elt.  */
2730   10, 10,                               /* Gather store static, per_elt.  */
2731   32,                                   /* size of l1 cache.  */
2732   2048,                                 /* size of l2 cache.  */
2733   64,                                   /* size of prefetch block */
2734   100,                                  /* number of parallel prefetches */
2735   2,                                    /* Branch cost */
2736   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
2737   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2738   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
2739   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2740   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2741   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
2742
2743   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2744   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2745   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
2746   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2747   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2748   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2749   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2750   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
2751   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
2752   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
2753   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2754   btver2_memcpy,
2755   btver2_memset,
2756   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
2757   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2758   "16:11:8",                            /* Loop alignment.  */
2759   "16:8:8",                             /* Jump alignment.  */
2760   "0:0:8",                              /* Label alignment.  */
2761   "11",                                 /* Func alignment.  */
2762   4,                                    /* Small unroll limit.  */
2763   2,                                    /* Small unroll factor.  */
2764   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
2765 };
2766
/* memcpy strategy table for the Pentium 4 cost table.  Element 0 lists
   loop_1_byte and a rep_prefix_4_byte catch-all; element 1 is
   DUMMY_STRINGOP_ALGS, which expands to a libcall-only strategy.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each {size, algorithm, flag} entry applies to blocks of up to
   SIZE bytes with -1 meaning "any size" -- confirm against the
   stringop_algs declaration.  */
2767 static stringop_algs pentium4_memcpy[2] = {
2768   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2769   DUMMY_STRINGOP_ALGS};
/* memset strategy table for the Pentium 4 cost table.  Element 0 lists
   loop_1_byte, loop, rep_prefix_4_byte and a libcall catch-all; element 1
   is DUMMY_STRINGOP_ALGS, which expands to a libcall-only strategy.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code, and each {size, algorithm, flag} entry applies to blocks of up to
   SIZE bytes with -1 meaning "any size" -- confirm against the
   stringop_algs declaration.  */
2770 static stringop_algs pentium4_memset[2] = {
2771   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2772              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2773   DUMMY_STRINGOP_ALGS};
2774
/* Operation cost table named for pentium4.  The initializer is
   positional: the trailing comment on each line names the field being
   filled, so entries must not be reordered or dropped.  Per the note at
   the top of this file, costs are relative to an add and COSTS_N_INSNS (N)
   is assumed to be (N)*4.  The string operation strategies come from
   pentium4_memcpy and pentium4_memset, and all four alignment entries
   are NULL.  */
2775 static const
2776 struct processor_costs pentium4_cost = {
2777   {
2778   /* Start of register allocator costs.  integer->integer move cost is 2. */
2779   5,                                 /* cost for loading QImode using movzbl */
2780   {4, 5, 4},                            /* cost of loading integer registers
2781                                            in QImode, HImode and SImode.
2782                                            Relative to reg-reg move (2).  */
2783   {2, 3, 2},                            /* cost of storing integer registers */
2784   12,                                   /* cost of reg,reg fld/fst */
2785   {14, 14, 14},                         /* cost of loading fp registers
2786                                            in SFmode, DFmode and XFmode */
2787   {14, 14, 14},                         /* cost of storing fp registers
2788                                            in SFmode, DFmode and XFmode */
2789   12,                                   /* cost of moving MMX register */
2790   {16, 16},                             /* cost of loading MMX registers
2791                                            in SImode and DImode */
2792   {16, 16},                             /* cost of storing MMX registers
2793                                            in SImode and DImode */
2794   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2795   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
2796                                            in 32,64,128,256 and 512-bit */
2797   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
2798                                            in 32,64,128,256 and 512-bit */
2799   20, 12,                               /* SSE->integer and integer->SSE moves */
2800   20, 12,                               /* mask->integer and integer->mask moves */
2801   {4, 5, 4},                            /* cost of loading mask register
2802                                            in QImode, HImode, SImode.  */
2803   {2, 3, 2},                            /* cost of storing mask register
2804                                            in QImode, HImode, SImode.  */
2805   2,                                    /* cost of moving mask register.  */
2806   /* End of register allocator costs.  */
2807   },
2808
2809   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2810   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
2811   COSTS_N_INSNS (4),                    /* variable shift costs */
2812   COSTS_N_INSNS (4),                    /* constant shift costs */
2813   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
2814    COSTS_N_INSNS (15),                  /*                               HI */
2815    COSTS_N_INSNS (15),                  /*                               SI */
2816    COSTS_N_INSNS (15),                  /*                               DI */
2817    COSTS_N_INSNS (15)},                 /*                            other */
2818   0,                                    /* cost of multiply per each bit set */
2819   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
2820    COSTS_N_INSNS (56),                  /*                          HI */
2821    COSTS_N_INSNS (56),                  /*                          SI */
2822    COSTS_N_INSNS (56),                  /*                          DI */
2823    COSTS_N_INSNS (56)},                 /*                          other */
2824   COSTS_N_INSNS (1),                    /* cost of movsx */
2825   COSTS_N_INSNS (1),                    /* cost of movzx */
2826   16,                                   /* "large" insn */
2827   6,                                    /* MOVE_RATIO */
2828   6,                                    /* CLEAR_RATIO */
2829   {4, 5, 4},                            /* cost of loading integer registers
2830                                            in QImode, HImode and SImode.
2831                                            Relative to reg-reg move (2).  */
2832   {2, 3, 2},                            /* cost of storing integer registers */
2833   {16, 16, 16, 32, 64},                 /* cost of loading SSE register
2834                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2835   {16, 16, 16, 32, 64},                 /* cost of storing SSE register
2836                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2837   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
2838   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
2839   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2840   20,                                   /* cost of moving SSE register to integer.  */
2841   16, 16,                               /* Gather load static, per_elt.  */
2842   16, 16,                               /* Gather store static, per_elt.  */
2843   8,                                    /* size of l1 cache.  */
2844   256,                                  /* size of l2 cache.  */
2845   64,                                   /* size of prefetch block */
2846   6,                                    /* number of parallel prefetches */
2847   2,                                    /* Branch cost */
2848   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
2849   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
2850   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
2851   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2852   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2853   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
2854
2855   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2856   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2857   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
2858   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
2859   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2860   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2861   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
2862   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
2863   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
2864   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
2865   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2866   pentium4_memcpy,
2867   pentium4_memset,
2868   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2869   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2870   NULL,                                 /* Loop alignment.  */
2871   NULL,                                 /* Jump alignment.  */
2872   NULL,                                 /* Label alignment.  */
2873   NULL,                                 /* Func alignment.  */
2874   4,                                    /* Small unroll limit.  */
2875   2,                                    /* Small unroll factor.  */
2876   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
2877 };
2878
/* Inline memcpy strategy table referenced by nocona_cost.  Entry 0 is
   loop_1_byte (12) then rep_prefix_4_byte; entry 1 steps through loop
   (32), rep_prefix_8_byte (20000) and unrolled_loop (100000) before the
   closing -1 entry selects libcall.
   NOTE(review): presumably the numbers are upper size bounds in bytes and
   entry 0/1 are the 32-bit/64-bit variants -- confirm against the
   declaration of stringop_algs.  */
2879 static stringop_algs nocona_memcpy[2] = {
2880   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2881   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2882              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2883
/* Inline memset strategy table referenced by nocona_cost.  Entry 0 steps
   through loop_1_byte (6), loop (48) and rep_prefix_4_byte (20480);
   entry 1 steps through loop (24), unrolled_loop (64) and
   rep_prefix_8_byte (8192).  Both end with a -1 entry selecting libcall.
   NOTE(review): presumably the numbers are upper size bounds in bytes and
   entry 0/1 are the 32-bit/64-bit variants -- confirm against the
   declaration of stringop_algs.  */
2884 static stringop_algs nocona_memset[2] = {
2885   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2886              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2887   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2888              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2889
/* Operation cost table named for nocona.  The initializer is positional:
   the trailing comment on each line names the field being filled, so
   entries must not be reordered or dropped.  Per the note at the top of
   this file, costs are relative to an add and COSTS_N_INSNS (N) is
   assumed to be (N)*4.  The string operation strategies come from
   nocona_memcpy and nocona_memset, and all four alignment entries are
   NULL.  */
2890 static const
2891 struct processor_costs nocona_cost = {
2892   {
2893   /* Start of register allocator costs.  integer->integer move cost is 2. */
2894   4,                                 /* cost for loading QImode using movzbl */
2895   {4, 4, 4},                            /* cost of loading integer registers
2896                                            in QImode, HImode and SImode.
2897                                            Relative to reg-reg move (2).  */
2898   {4, 4, 4},                            /* cost of storing integer registers */
2899   12,                                   /* cost of reg,reg fld/fst */
2900   {14, 14, 14},                         /* cost of loading fp registers
2901                                            in SFmode, DFmode and XFmode */
2902   {14, 14, 14},                         /* cost of storing fp registers
2903                                            in SFmode, DFmode and XFmode */
2904   14,                                   /* cost of moving MMX register */
2905   {12, 12},                             /* cost of loading MMX registers
2906                                            in SImode and DImode */
2907   {12, 12},                             /* cost of storing MMX registers
2908                                            in SImode and DImode */
2909   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2910   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
2911                                            in 32,64,128,256 and 512-bit */
2912   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
2913                                            in 32,64,128,256 and 512-bit */
2914   20, 12,                               /* SSE->integer and integer->SSE moves */
2915   20, 12,                               /* mask->integer and integer->mask moves */
2916   {4, 4, 4},                            /* cost of loading mask register
2917                                            in QImode, HImode, SImode.  */
2918   {4, 4, 4},                            /* cost of storing mask register
2919                                            in QImode, HImode, SImode.  */
2920   2,                                    /* cost of moving mask register.  */
2921   /* End of register allocator costs.  */
2922   },
2923
2924   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2925   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
2926   COSTS_N_INSNS (1),                    /* variable shift costs */
2927   COSTS_N_INSNS (1),                    /* constant shift costs */
2928   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
2929    COSTS_N_INSNS (10),                  /*                               HI */
2930    COSTS_N_INSNS (10),                  /*                               SI */
2931    COSTS_N_INSNS (10),                  /*                               DI */
2932    COSTS_N_INSNS (10)},                 /*                            other */
2933   0,                                    /* cost of multiply per each bit set */
2934   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
2935    COSTS_N_INSNS (66),                  /*                          HI */
2936    COSTS_N_INSNS (66),                  /*                          SI */
2937    COSTS_N_INSNS (66),                  /*                          DI */
2938    COSTS_N_INSNS (66)},                 /*                          other */
2939   COSTS_N_INSNS (1),                    /* cost of movsx */
2940   COSTS_N_INSNS (1),                    /* cost of movzx */
2941   16,                                   /* "large" insn */
2942   17,                                   /* MOVE_RATIO */
2943   6,                                    /* CLEAR_RATIO */
2944   {4, 4, 4},                            /* cost of loading integer registers
2945                                            in QImode, HImode and SImode.
2946                                            Relative to reg-reg move (2).  */
2947   {4, 4, 4},                            /* cost of storing integer registers */
2948   {12, 12, 12, 24, 48},                 /* cost of loading SSE register
2949                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2950   {12, 12, 12, 24, 48},                 /* cost of storing SSE register
2951                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2952   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
2953   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
2954   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2955   20,                                   /* cost of moving SSE register to integer.  */
2956   12, 12,                               /* Gather load static, per_elt.  */
2957   12, 12,                               /* Gather store static, per_elt.  */
2958   8,                                    /* size of l1 cache.  */
2959   1024,                                 /* size of l2 cache.  */
2960   64,                                   /* size of prefetch block */
2961   8,                                    /* number of parallel prefetches */
2962   1,                                    /* Branch cost */
2963   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
2964   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2965   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
2966   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
2967   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
2968   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
2969
2970   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2971   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2972   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
2973   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
2974   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
2975   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
2976   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
2977   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
2978   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
2979   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
2980   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2981   nocona_memcpy,
2982   nocona_memset,
2983   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2984   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2985   NULL,                                 /* Loop alignment.  */
2986   NULL,                                 /* Jump alignment.  */
2987   NULL,                                 /* Label alignment.  */
2988   NULL,                                 /* Func alignment.  */
2989   4,                                    /* Small unroll limit.  */
2990   2,                                    /* Small unroll factor.  */
2991   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
2992 };
2993
/* Inline memcpy strategy table referenced by atom_cost.  Entry 0 is loop
   (11) then rep_prefix_4_byte; entry 1 steps through loop (32),
   rep_prefix_4_byte (64) and rep_prefix_8_byte (8192) before the closing
   -1 entry selects libcall.
   NOTE(review): presumably the numbers are upper size bounds in bytes and
   entry 0/1 are the 32-bit/64-bit variants -- confirm against the
   declaration of stringop_algs.  */
2994 static stringop_algs atom_memcpy[2] = {
2995   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2996   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2997              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Inline memset strategy table referenced by atom_cost.  Entry 0 steps
   through loop (8), unrolled_loop (15) and rep_prefix_4_byte (2048);
   entry 1 steps through loop (24), unrolled_loop (32) and
   rep_prefix_8_byte (8192).  Both end with a -1 entry selecting libcall.
   NOTE(review): presumably the numbers are upper size bounds in bytes and
   entry 0/1 are the 32-bit/64-bit variants -- confirm against the
   declaration of stringop_algs.  */
2998 static stringop_algs atom_memset[2] = {
2999   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3000              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3001   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3002              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation cost table named for atom.  The initializer is positional:
   the trailing comment on each line names the field being filled, so
   entries must not be reordered or dropped.  Per the note at the top of
   this file, costs are relative to an add and COSTS_N_INSNS (N) is
   assumed to be (N)*4.  The string operation strategies come from
   atom_memcpy and atom_memset; the four alignment entries are given as
   strings.  */
3003 static const
3004 struct processor_costs atom_cost = {
3005   {
3006   /* Start of register allocator costs.  integer->integer move cost is 2. */
3007   6,                                    /* cost for loading QImode using movzbl */
3008   {6, 6, 6},                            /* cost of loading integer registers
3009                                            in QImode, HImode and SImode.
3010                                            Relative to reg-reg move (2).  */
3011   {6, 6, 6},                            /* cost of storing integer registers */
3012   4,                                    /* cost of reg,reg fld/fst */
3013   {6, 6, 18},                           /* cost of loading fp registers
3014                                            in SFmode, DFmode and XFmode */
3015   {14, 14, 24},                         /* cost of storing fp registers
3016                                            in SFmode, DFmode and XFmode */
3017   2,                                    /* cost of moving MMX register */
3018   {8, 8},                               /* cost of loading MMX registers
3019                                            in SImode and DImode */
3020   {10, 10},                             /* cost of storing MMX registers
3021                                            in SImode and DImode */
3022   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3023   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
3024                                            in 32,64,128,256 and 512-bit */
3025   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
3026                                            in 32,64,128,256 and 512-bit */
3027   8, 6,                         /* SSE->integer and integer->SSE moves */
3028   8, 6,                         /* mask->integer and integer->mask moves */
3029   {6, 6, 6},                            /* cost of loading mask register
3030                                            in QImode, HImode, SImode.  */
3031   {6, 6, 6},                    /* cost of storing mask register
3032                                            in QImode, HImode, SImode.  */
3033   2,                                    /* cost of moving mask register.  */
3034   /* End of register allocator costs.  */
3035   },
3036
3037   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3038   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; one unit
                                           above the add cost.  */
3039   COSTS_N_INSNS (1),                    /* variable shift costs */
3040   COSTS_N_INSNS (1),                    /* constant shift costs */
3041   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3042    COSTS_N_INSNS (4),                   /*                               HI */
3043    COSTS_N_INSNS (3),                   /*                               SI */
3044    COSTS_N_INSNS (4),                   /*                               DI */
3045    COSTS_N_INSNS (2)},                  /*                            other */
3046   0,                                    /* cost of multiply per each bit set */
3047   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3048    COSTS_N_INSNS (26),                  /*                          HI */
3049    COSTS_N_INSNS (42),                  /*                          SI */
3050    COSTS_N_INSNS (74),                  /*                          DI */
3051    COSTS_N_INSNS (74)},                 /*                          other */
3052   COSTS_N_INSNS (1),                    /* cost of movsx */
3053   COSTS_N_INSNS (1),                    /* cost of movzx */
3054   8,                                    /* "large" insn */
3055   17,                                   /* MOVE_RATIO */
3056   6,                                    /* CLEAR_RATIO */
3057   {6, 6, 6},                            /* cost of loading integer registers
3058                                            in QImode, HImode and SImode.
3059                                            Relative to reg-reg move (2).  */
3060   {6, 6, 6},                            /* cost of storing integer registers */
3061   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
3062                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3063   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
3064                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3065   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
3066   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
3067   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3068   8,                                    /* cost of moving SSE register to integer.  */
3069   8, 8,                                 /* Gather load static, per_elt.  */
3070   8, 8,                                 /* Gather store static, per_elt.  */
3071   32,                                   /* size of l1 cache.  */
3072   256,                                  /* size of l2 cache.  */
3073   64,                                   /* size of prefetch block */
3074   6,                                    /* number of parallel prefetches */
3075   3,                                    /* Branch cost */
3076   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3077   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3078   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3079   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3080   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3081   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3082
3083   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3084   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3085   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3086   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3087   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3088   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3089   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
3090   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
3091   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
3092   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
3093   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
3094   atom_memcpy,
3095   atom_memset,
3096   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3097   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3098   "16",                                 /* Loop alignment.  */
3099   "16:8:8",                             /* Jump alignment.  */
3100   "0:0:8",                              /* Label alignment.  */
3101   "16",                                 /* Func alignment.  */
3102   4,                                    /* Small unroll limit.  */
3103   2,                                    /* Small unroll factor.  */
3104   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3105 };
3106
/* Inline memcpy strategy table referenced by slm_cost.  Its contents
   match atom_memcpy: entry 0 is loop (11) then rep_prefix_4_byte; entry 1
   steps through loop (32), rep_prefix_4_byte (64) and rep_prefix_8_byte
   (8192) before the closing -1 entry selects libcall.
   NOTE(review): presumably the numbers are upper size bounds in bytes and
   entry 0/1 are the 32-bit/64-bit variants -- confirm against the
   declaration of stringop_algs.  */
3107 static stringop_algs slm_memcpy[2] = {
3108   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3109   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3110              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Inline memset strategy table referenced by slm_cost.  Its contents
   match atom_memset: entry 0 steps through loop (8), unrolled_loop (15)
   and rep_prefix_4_byte (2048); entry 1 steps through loop (24),
   unrolled_loop (32) and rep_prefix_8_byte (8192).  Both end with a -1
   entry selecting libcall.
   NOTE(review): presumably the numbers are upper size bounds in bytes and
   entry 0/1 are the 32-bit/64-bit variants -- confirm against the
   declaration of stringop_algs.  */
3111 static stringop_algs slm_memset[2] = {
3112   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3113              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3114   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3115              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation cost table named for slm.  The initializer is positional:
   the trailing comment on each line names the field being filled, so
   entries must not be reordered or dropped.  Per the note at the top of
   this file, costs are relative to an add and COSTS_N_INSNS (N) is
   assumed to be (N)*4.  The string operation strategies come from
   slm_memcpy and slm_memset; the four alignment entries are given as
   strings.  */
3116 static const
3117 struct processor_costs slm_cost = {
3118   {
3119   /* Start of register allocator costs.  integer->integer move cost is 2. */
3120   8,                                    /* cost for loading QImode using movzbl */
3121   {8, 8, 8},                            /* cost of loading integer registers
3122                                            in QImode, HImode and SImode.
3123                                            Relative to reg-reg move (2).  */
3124   {6, 6, 6},                            /* cost of storing integer registers */
3125   2,                                    /* cost of reg,reg fld/fst */
3126   {8, 8, 18},                           /* cost of loading fp registers
3127                                            in SFmode, DFmode and XFmode */
3128   {6, 6, 18},                           /* cost of storing fp registers
3129                                            in SFmode, DFmode and XFmode */
3130   2,                                    /* cost of moving MMX register */
3131   {8, 8},                               /* cost of loading MMX registers
3132                                            in SImode and DImode */
3133   {6, 6},                               /* cost of storing MMX registers
3134                                            in SImode and DImode */
3135   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3136   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
3137                                            in 32,64,128,256 and 512-bit */
3138   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
3139                                            in 32,64,128,256 and 512-bit */
3140   8, 6,                         /* SSE->integer and integer->SSE moves */
3141   8, 6,                         /* mask->integer and integer->mask moves */
3142   {8, 8, 8},                    /* cost of loading mask register
3143                                            in QImode, HImode, SImode.  */
3144   {6, 6, 6},                    /* cost of storing mask register
3145                                            in QImode, HImode, SImode.  */
3146   2,                                    /* cost of moving mask register.  */
3147   /* End of register allocator costs.  */
3148   },
3149
3150   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3151   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; one unit
                                           above the add cost.  */
3152   COSTS_N_INSNS (1),                    /* variable shift costs */
3153   COSTS_N_INSNS (1),                    /* constant shift costs */
3154   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3155    COSTS_N_INSNS (3),                   /*                               HI */
3156    COSTS_N_INSNS (3),                   /*                               SI */
3157    COSTS_N_INSNS (4),                   /*                               DI */
3158    COSTS_N_INSNS (2)},                  /*                            other */
3159   0,                                    /* cost of multiply per each bit set */
3160   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3161    COSTS_N_INSNS (26),                  /*                          HI */
3162    COSTS_N_INSNS (42),                  /*                          SI */
3163    COSTS_N_INSNS (74),                  /*                          DI */
3164    COSTS_N_INSNS (74)},                 /*                          other */
3165   COSTS_N_INSNS (1),                    /* cost of movsx */
3166   COSTS_N_INSNS (1),                    /* cost of movzx */
3167   8,                                    /* "large" insn */
3168   17,                                   /* MOVE_RATIO */
3169   6,                                    /* CLEAR_RATIO */
3170   {8, 8, 8},                            /* cost of loading integer registers
3171                                            in QImode, HImode and SImode.
3172                                            Relative to reg-reg move (2).  */
3173   {6, 6, 6},                            /* cost of storing integer registers */
3174   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
3175                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3176   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
3177                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3178   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
3179   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
3180   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3181   8,                                    /* cost of moving SSE register to integer.  */
3182   8, 8,                                 /* Gather load static, per_elt.  */
3183   8, 8,                                 /* Gather store static, per_elt.  */
3184   32,                                   /* size of l1 cache.  */
3185   256,                                  /* size of l2 cache.  */
3186   64,                                   /* size of prefetch block */
3187   6,                                    /* number of parallel prefetches */
3188   3,                                    /* Branch cost */
3189   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3190   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3191   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3192   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3193   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3194   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3195
3196   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3197   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3198   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3199   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3200   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3201   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3202   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
3203   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
3204   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
3205   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
3206   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
3207   slm_memcpy,
3208   slm_memset,
3209   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3210   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3211   "16",                                 /* Loop alignment.  */
3212   "16:8:8",                             /* Jump alignment.  */
3213   "0:0:8",                              /* Label alignment.  */
3214   "16",                                 /* Func alignment.  */
3215   4,                                    /* Small unroll limit.  */
3216   2,                                    /* Small unroll factor.  */
3217   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3218 };
3219
/* Inline memcpy strategy table referenced by tremont_cost.  Both entries
   are identical: rep_prefix_1_byte (256, flag true), loop (256, flag
   false), then a closing -1 entry selecting libcall.
   NOTE(review): the first two steps share the bound 256; presumably the
   loop step is a fallback for when rep_prefix_1_byte cannot be used --
   confirm against the code that consumes stringop_algs.  Presumably
   entry 0/1 are the 32-bit/64-bit variants -- confirm as well.  */
3220 static stringop_algs tremont_memcpy[2] = {
3221   {libcall,
3222    {{256, rep_prefix_1_byte, true},
3223     {256, loop, false},
3224     {-1, libcall, false}}},
3225   {libcall,
3226    {{256, rep_prefix_1_byte, true},
3227     {256, loop, false},
3228     {-1, libcall, false}}}};
/* Inline memset strategy table referenced by tremont_cost.  Its contents
   match tremont_memcpy, and both entries are identical:
   rep_prefix_1_byte (256, flag true), loop (256, flag false), then a
   closing -1 entry selecting libcall.
   NOTE(review): the first two steps share the bound 256; presumably the
   loop step is a fallback for when rep_prefix_1_byte cannot be used --
   confirm against the code that consumes stringop_algs.  Presumably
   entry 0/1 are the 32-bit/64-bit variants -- confirm as well.  */
3229 static stringop_algs tremont_memset[2] = {
3230   {libcall,
3231    {{256, rep_prefix_1_byte, true},
3232     {256, loop, false},
3233     {-1, libcall, false}}},
3234   {libcall,
3235    {{256, rep_prefix_1_byte, true},
3236     {256, loop, false},
3237     {-1, libcall, false}}}};
3238 static const
3239 struct processor_costs tremont_cost = {   /* costs for tuning for tremont */
3240   {
3241   /* Start of register allocator costs.  integer->integer move cost is 2. */
3242   6,                                 /* cost for loading QImode using movzbl */
3243   {6, 6, 6},                            /* cost of loading integer registers
3244                                            in QImode, HImode and SImode.
3245                                            Relative to reg-reg move (2).  */
3246   {6, 6, 6},                            /* cost of storing integer registers */
3247   4,                                    /* cost of reg,reg fld/fst */
3248   {6, 6, 12},                           /* cost of loading fp registers
3249                                            in SFmode, DFmode and XFmode */
3250   {6, 6, 12},                           /* cost of storing fp registers
3251                                            in SFmode, DFmode and XFmode */
3252   2,                                    /* cost of moving MMX register */
3253   {6, 6},                               /* cost of loading MMX registers
3254                                            in SImode and DImode */
3255   {6, 6},                               /* cost of storing MMX registers
3256                                            in SImode and DImode */
3257   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3258   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
3259                                            in 32,64,128,256 and 512-bit */
3260   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
3261                                            in 32,64,128,256 and 512-bit */
3262   6, 6,                         /* SSE->integer and integer->SSE moves */
3263   6, 6,                         /* mask->integer and integer->mask moves */
3264   {6, 6, 6},                            /* cost of loading mask register
3265                                            in QImode, HImode, SImode.  */
3266   {6, 6, 6},                    /* cost of storing mask register
3267                                            in QImode, HImode, SImode.  */
3268   2,                                    /* cost of moving mask register.  */
3269   /* End of register allocator costs.  */
3270   },
3271
3272   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3273   /* Setting cost to 2 makes our current implementation of synth_mult result in
3274      use of unnecessary temporary registers causing regression on several
3275      SPECfp benchmarks.  */
3276   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3277   COSTS_N_INSNS (1),                    /* variable shift costs */
3278   COSTS_N_INSNS (1),                    /* constant shift costs */
3279   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3280    COSTS_N_INSNS (3),                   /*                               HI */
3281    COSTS_N_INSNS (3),                   /*                               SI */
3282    COSTS_N_INSNS (3),                   /*                               DI */
3283    COSTS_N_INSNS (4)},                  /*                            other */
3284   0,                                    /* cost of multiply per each bit set */
3285   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
3286    COSTS_N_INSNS (22),                  /*                          HI */
3287    COSTS_N_INSNS (30),                  /*                          SI */
3288    COSTS_N_INSNS (74),                  /*                          DI */
3289    COSTS_N_INSNS (74)},                 /*                          other */
3290   COSTS_N_INSNS (1),                    /* cost of movsx */
3291   COSTS_N_INSNS (1),                    /* cost of movzx */
3292   8,                                    /* "large" insn */
3293   17,                                   /* MOVE_RATIO */
3294   17,                                   /* CLEAR_RATIO */
3295   {6, 6, 6},                            /* cost of loading integer registers
3296                                            in QImode, HImode and SImode.
3297                                            Relative to reg-reg move (2).  */
3298   {6, 6, 6},                            /* cost of storing integer registers */
3299   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3300                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3301   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3302                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3303   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3304   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
3305   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3306   6,                                    /* cost of moving SSE register to integer.  */
3307   18, 6,                                /* Gather load static, per_elt.  */
3308   18, 6,                                /* Gather store static, per_elt.  */
3309   32,                                   /* size of l1 cache.  */
3310   512,                                  /* size of l2 cache.  */
3311   64,                                   /* size of prefetch block */
3312   6,                                    /* number of parallel prefetches */
3313   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3314      value is increased to the perhaps more appropriate value of 5.  */
3315   3,                                    /* Branch cost */
3316   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3317   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
3318   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
3319   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3320   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3321   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
3322
3323   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3324   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3325   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3326   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3327   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3328   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3329   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3330   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3331   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
3332   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3333   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3334   tremont_memcpy,                       /* memcpy algorithm table.  */
3335   tremont_memset,                       /* memset algorithm table.  */
3336   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3337   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3338   "16:11:8",                            /* Loop alignment.  */
3339   "16:11:8",                            /* Jump alignment.  */
3340   "0:0:8",                              /* Label alignment.  */
3341   "16",                                 /* Func alignment.  */
3342   4,                                    /* Small unroll limit.  */
3343   2,                                    /* Small unroll factor.  */
3344   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3345 };
3346
3347 static stringop_algs intel_memcpy[2] = {   /* memcpy algorithm table referenced by intel_cost.  */
3348   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3349   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3350              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
3351 static stringop_algs intel_memset[2] = {   /* memset algorithm table referenced by intel_cost.  */
3352   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3353              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3354   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3355              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
3356 static const
3357 struct processor_costs intel_cost = {   /* costs for tuning for intel */
3358   {
3359   /* Start of register allocator costs.  integer->integer move cost is 2. */
3360   6,                                 /* cost for loading QImode using movzbl */
3361   {4, 4, 4},                            /* cost of loading integer registers
3362                                            in QImode, HImode and SImode.
3363                                            Relative to reg-reg move (2).  */
3364   {6, 6, 6},                            /* cost of storing integer registers */
3365   2,                                    /* cost of reg,reg fld/fst */
3366   {6, 6, 8},                            /* cost of loading fp registers
3367                                            in SFmode, DFmode and XFmode */
3368   {6, 6, 10},                           /* cost of storing fp registers
3369                                            in SFmode, DFmode and XFmode */
3370   2,                                    /* cost of moving MMX register */
3371   {6, 6},                               /* cost of loading MMX registers
3372                                            in SImode and DImode */
3373   {6, 6},                               /* cost of storing MMX registers
3374                                            in SImode and DImode */
3375   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
3376   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
3377                                            in 32,64,128,256 and 512-bit */
3378   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
3379                                            in 32,64,128,256 and 512-bit */
3380   4, 4,                         /* SSE->integer and integer->SSE moves */
3381   4, 4,                         /* mask->integer and integer->mask moves */
3382   {4, 4, 4},                            /* cost of loading mask register
3383                                            in QImode, HImode, SImode.  */
3384   {6, 6, 6},                            /* cost of storing mask register
3385                                            in QImode, HImode, SImode.  */
3386   2,                                    /* cost of moving mask register.  */
3387   /* End of register allocator costs.  */
3388   },
3389
3390   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3391   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3392   COSTS_N_INSNS (1),                    /* variable shift costs */
3393   COSTS_N_INSNS (1),                    /* constant shift costs */
3394   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3395    COSTS_N_INSNS (3),                   /*                               HI */
3396    COSTS_N_INSNS (3),                   /*                               SI */
3397    COSTS_N_INSNS (4),                   /*                               DI */
3398    COSTS_N_INSNS (2)},                  /*                            other */
3399   0,                                    /* cost of multiply per each bit set */
3400   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3401    COSTS_N_INSNS (26),                  /*                          HI */
3402    COSTS_N_INSNS (42),                  /*                          SI */
3403    COSTS_N_INSNS (74),                  /*                          DI */
3404    COSTS_N_INSNS (74)},                 /*                          other */
3405   COSTS_N_INSNS (1),                    /* cost of movsx */
3406   COSTS_N_INSNS (1),                    /* cost of movzx */
3407   8,                                    /* "large" insn */
3408   17,                                   /* MOVE_RATIO */
3409   6,                                    /* CLEAR_RATIO */
3410   {4, 4, 4},                            /* cost of loading integer registers
3411                                            in QImode, HImode and SImode.
3412                                            Relative to reg-reg move (2).  */
3413   {6, 6, 6},                            /* cost of storing integer registers */
3414   {6, 6, 6, 6, 6},                      /* cost of loading SSE register
3415                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3416   {6, 6, 6, 6, 6},                      /* cost of storing SSE register
3417                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3418   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
3419   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
3420   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
3421   4,                                    /* cost of moving SSE register to integer.  */
3422   6, 6,                                 /* Gather load static, per_elt.  */
3423   6, 6,                                 /* Gather store static, per_elt.  */
3424   32,                                   /* size of l1 cache.  */
3425   256,                                  /* size of l2 cache.  */
3426   64,                                   /* size of prefetch block */
3427   6,                                    /* number of parallel prefetches */
3428   3,                                    /* Branch cost */
3429   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3430   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3431   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3432   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3433   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3434   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3435
3436   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3437   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3438   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
3439   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
3440   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3441   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3442   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
3443   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
3444   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
3445   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
3446   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
3447   intel_memcpy,                         /* memcpy algorithm table.  */
3448   intel_memset,                         /* memset algorithm table.  */
3449   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3450   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3451   "16",                                 /* Loop alignment.  */
3452   "16:8:8",                             /* Jump alignment.  */
3453   "0:0:8",                              /* Label alignment.  */
3454   "16",                                 /* Func alignment.  */
3455   4,                                    /* Small unroll limit.  */
3456   2,                                    /* Small unroll factor.  */
3457   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3458 };
3459
3460 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
3461 static stringop_algs lujiazui_memcpy[2] = {   /* memcpy algorithm table referenced by lujiazui_cost.  */
3462   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3463                          {-1, libcall, false}}},
3464   {libcall, {{12, unrolled_loop, true}, {32, loop, false},
3465                          {6144, rep_prefix_8_byte, false},
3466                          {-1, libcall, false}}}};
3467 static stringop_algs lujiazui_memset[2] = {   /* memset algorithm table referenced by lujiazui_cost.  */
3468   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3469                          {-1, libcall, false}}},
3470   {libcall, {{12, loop, true}, {32, loop, false},
3471                          {640, rep_prefix_8_byte, false},
3472                          {-1, libcall, false}}}};
3473 static const
3474 struct processor_costs lujiazui_cost = {   /* costs for tuning for ZHAOXIN lujiazui */
3475   {
3476   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3477   6,                            /* cost for loading QImode using movzbl.  */
3478   {6, 6, 6},                    /* cost of loading integer registers
3479                                            in QImode, HImode and SImode.
3480                                            Relative to reg-reg move (2).  */
3481   {6, 6, 6},                    /* cost of storing integer registers.  */
3482   2,                                    /* cost of reg,reg fld/fst.  */
3483   {6, 6, 8},                    /* cost of loading fp registers
3484                                 in SFmode, DFmode and XFmode.  */
3485   {6, 6, 8},                    /* cost of storing fp registers
3486                                 in SFmode, DFmode and XFmode.  */
3487   2,                            /* cost of moving MMX register.  */
3488   {6, 6},                       /* cost of loading MMX registers
3489                                 in SImode and DImode.  */
3490   {6, 6},                       /* cost of storing MMX registers
3491                                 in SImode and DImode.  */
3492   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3493   {6, 6, 6, 10, 15},    /* cost of loading SSE registers
3494                                 in 32,64,128,256 and 512-bit.  */
3495   {6, 6, 6, 10, 15},    /* cost of storing SSE registers
3496                                 in 32,64,128,256 and 512-bit.  */
3497   6, 6,                         /* SSE->integer and integer->SSE moves.  */
3498   6, 6,                         /* mask->integer and integer->mask moves.  */
3499   {6, 6, 6},            /* cost of loading mask register
3500                                 in QImode, HImode, SImode.  */
3501   {6, 6, 6},            /* cost of storing mask register
3502                                 in QImode, HImode, SImode.  */
3503   2,                            /* cost of moving mask register.  */
3504   /* End of register allocator costs.  */
3505   },
3506
3507   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3508   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction.  */
3509   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3510   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3511   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3512    COSTS_N_INSNS (3),                   /*                               HI.  */
3513    COSTS_N_INSNS (3),                   /*                               SI.  */
3514    COSTS_N_INSNS (12),                  /*                               DI.  */
3515    COSTS_N_INSNS (14)},         /*                               other.  */
3516   0,                            /* cost of multiply per each bit set.  */
3517   {COSTS_N_INSNS (22),                  /* cost of a divide/mod for QI.  */
3518    COSTS_N_INSNS (24),                  /*                          HI.  */
3519    COSTS_N_INSNS (24),                  /*                          SI.  */
3520    COSTS_N_INSNS (150),                 /*                          DI.  */
3521    COSTS_N_INSNS (152)},                /*                          other.  */
3522   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3523   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3524   8,                                    /* "large" insn.  */
3525   17,                                   /* MOVE_RATIO.  */
3526   6,                                    /* CLEAR_RATIO.  */
3527   {6, 6, 6},                            /* cost of loading integer registers
3528                                            in QImode, HImode and SImode.
3529                                            Relative to reg-reg move (2).  */
3530   {6, 6, 6},                    /* cost of storing integer registers.  */
3531   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3532                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3533   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3534                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3535   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3536   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
3537   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3538   6,                            /* cost of moving SSE register to integer.  */
3539   18, 6,                                /* Gather load static, per_elt.  */
3540   18, 6,                                /* Gather store static, per_elt.  */
3541   32,                                   /* size of l1 cache.  */
3542   4096,                                 /* size of l2 cache.  */
3543   64,                                   /* size of prefetch block.  */
3544   /* Lujiazui processors never drop prefetches, like AMD processors.  */
3545   100,                                  /* number of parallel prefetches.  */
3546   3,                                    /* Branch cost.  */
3547   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3548   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
3549   COSTS_N_INSNS (22),                   /* cost of FDIV instruction.  */
3550   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3551   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3552   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
3553
3554   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3555   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3556   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3557   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
3558   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3559   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3560   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3561   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3562   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
3563   COSTS_N_INSNS (60),                   /* cost of SQRTSD instruction.  */
3564   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3565   lujiazui_memcpy,                      /* memcpy algorithm table.  */
3566   lujiazui_memset,                      /* memset algorithm table.  */
3567   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3568   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3569   "16:11:8",                            /* Loop alignment.  */
3570   "16:11:8",                            /* Jump alignment.  */
3571   "0:0:8",                              /* Label alignment.  */
3572   "16",                                 /* Func alignment.  */
3573   4,                                    /* Small unroll limit.  */
3574   2,                                    /* Small unroll factor.  */
3575   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3576 };
3577
3578 /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU.  */
3579 static stringop_algs yongfeng_memcpy[2] = {   /* memcpy algorithm table referenced by yongfeng_cost.  */
3580   {libcall, {{6, unrolled_loop, true}, {256, unrolled_loop, false},
3581                          {-1, libcall, false}}},
3582   {libcall, {{8, loop, false}, {512, unrolled_loop, false},
3583                          {-1, libcall, false}}}};
3584 static stringop_algs yongfeng_memset[2] = {   /* memset algorithm table referenced by yongfeng_cost.  */
3585   {libcall, {{6, loop_1_byte, false}, {128, loop, false},
3586                          {-1, libcall, false}}},
3587   {libcall, {{2, rep_prefix_4_byte, false}, {64, loop, false},
3588                          {1024, vector_loop, false},
3589                          {-1, libcall, false}}}};
3590 static const
3591 struct processor_costs yongfeng_cost = {   /* costs for tuning for ZHAOXIN yongfeng */
3592   {
3593   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3594   8,                            /* cost for loading QImode using movzbl.  */
3595   {8, 8, 8},                    /* cost of loading integer registers
3596                                            in QImode, HImode and SImode.
3597                                            Relative to reg-reg move (2).  */
3598   {8, 8, 8},                    /* cost of storing integer registers.  */
3599   2,                                    /* cost of reg,reg fld/fst.  */
3600   {8, 8, 8},                    /* cost of loading fp registers
3601                                 in SFmode, DFmode and XFmode.  */
3602   {8, 8, 8},                    /* cost of storing fp registers
3603                                 in SFmode, DFmode and XFmode.  */
3604   2,                            /* cost of moving MMX register.  */
3605   {8, 8},                       /* cost of loading MMX registers
3606                                 in SImode and DImode.  */
3607   {8, 8},                       /* cost of storing MMX registers
3608                                 in SImode and DImode.  */
3609   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3610   {8, 8, 8, 10, 15},    /* cost of loading SSE registers
3611                                 in 32,64,128,256 and 512-bit.  NOTE(review): the 256-bit load/store cost is 10 in these two tables but 12 in the SSE tables after CLEAR_RATIO; confirm which is intended.  */
3612   {8, 8, 8, 10, 15},    /* cost of storing SSE registers
3613                                 in 32,64,128,256 and 512-bit.  */
3614   8, 8,                         /* SSE->integer and integer->SSE moves.  */
3615   8, 8,                         /* mask->integer and integer->mask moves.  */
3616   {8, 8, 8},            /* cost of loading mask register
3617                                 in QImode, HImode, SImode.  */
3618   {8, 8, 8},            /* cost of storing mask register
3619                                 in QImode, HImode, SImode.  */
3620   2,                            /* cost of moving mask register.  */
3621   /* End of register allocator costs.  */
3622   },
3623
3624   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3625   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
3626   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3627   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3628   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3629    COSTS_N_INSNS (3),                   /*                               HI.  */
3630    COSTS_N_INSNS (2),                   /*                               SI.  */
3631    COSTS_N_INSNS (2),                   /*                               DI.  */
3632    COSTS_N_INSNS (3)},          /*                               other.  */
3633   0,                            /* cost of multiply per each bit set.  */
3634   {COSTS_N_INSNS (8),                   /* cost of a divide/mod for QI.  */
3635    COSTS_N_INSNS (9),                   /*                          HI.  */
3636    COSTS_N_INSNS (8),                   /*                          SI.  */
3637    COSTS_N_INSNS (41),                  /*                          DI.  */
3638    COSTS_N_INSNS (41)},         /*                          other.  */
3639   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3640   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3641   8,                                    /* "large" insn.  */
3642   17,                                   /* MOVE_RATIO.  */
3643   6,                                    /* CLEAR_RATIO.  */
3644   {8, 8, 8},                            /* cost of loading integer registers
3645                                            in QImode, HImode and SImode.
3646                                            Relative to reg-reg move (2).  */
3647   {8, 8, 8},                    /* cost of storing integer registers.  */
3648   {8, 8, 8, 12, 15},                    /* cost of loading SSE register
3649                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3650   {8, 8, 8, 12, 15},                    /* cost of storing SSE register
3651                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3652   {8, 8, 8, 12, 15},                    /* cost of unaligned loads.  */
3653   {8, 8, 8, 12, 15},                    /* cost of unaligned stores.  */
3654   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3655   8,                            /* cost of moving SSE register to integer.  */
3656   18, 6,                                /* Gather load static, per_elt.  */
3657   18, 6,                                /* Gather store static, per_elt.  */
3658   32,                                   /* size of l1 cache.  */
3659   256,                                  /* size of l2 cache.  */
3660   64,                                   /* size of prefetch block.  */
3661   12,                                   /* number of parallel prefetches.  */
3662   3,                                    /* Branch cost.  */
3663   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3664   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
3665   COSTS_N_INSNS (14),                   /* cost of FDIV instruction.  */
3666   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3667   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3668   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3669
3670   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3671   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3672   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3673   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
3674   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3675   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3676   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
3677   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
3678   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
3679   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
3680   4, 4, 4, 4,                           /* reassoc int, fp, vec_int, vec_fp.  */
3681   yongfeng_memcpy,                      /* memcpy algorithm table.  */
3682   yongfeng_memset,                      /* memset algorithm table.  */
3683   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3684   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3685   "16:11:8",                            /* Loop alignment.  */
3686   "16:11:8",                            /* Jump alignment.  */
3687   "0:0:8",                              /* Label alignment.  */
3688   "16",                                 /* Func alignment.  */
3689   4,                                    /* Small unroll limit.  */
3690   2,                                    /* Small unroll factor.  */
3691   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3692 };
3693
3694 /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU.  */
3695 static stringop_algs shijidadao_memcpy[2] = {   /* memcpy algorithm table for ZHAOXIN shijidadao tuning.  */
3696   {libcall, {{8, unrolled_loop, true}, {256, unrolled_loop, false},
3697                          {-1, libcall, false}}},
3698   {libcall, {{10, loop, true}, {256, unrolled_loop, false},
3699                          {-1, libcall, false}}}};
3700 static stringop_algs shijidadao_memset[2] = {   /* memset algorithm table for ZHAOXIN shijidadao tuning.  */
3701   {libcall, {{4, loop, true}, {128, unrolled_loop, false},
3702                          {-1, libcall, false}}},
3703   {libcall, {{1, rep_prefix_4_byte, false}, {14, loop, true},
3704                          {1024, vector_loop, false},
3705                          {-1, libcall, false}}}};
/* Cost table for the ZHAOXIN shijidadao tuning.  This is a positional
   initializer of struct processor_costs: each value is identified only by
   the comment on its line, so keep values and comments in step.  */
3706 static const
3707 struct processor_costs shijidadao_cost = {
3708   {
3709   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3710   8,                            /* cost for loading QImode using movzbl.  */
3711   {8, 8, 8},                    /* cost of loading integer registers
3712                                            in QImode, HImode and SImode.
3713                                            Relative to reg-reg move (2).  */
3714   {8, 8, 8},                    /* cost of storing integer registers.  */
3715   2,                                    /* cost of reg,reg fld/fst.  */
3716   {8, 8, 8},                    /* cost of loading fp registers
3717                                 in SFmode, DFmode and XFmode.  */
3718   {8, 8, 8},                    /* cost of storing fp registers
3719                                 in SFmode, DFmode and XFmode.  */
3720   2,                            /* cost of moving MMX register.  */
3721   {8, 8},                       /* cost of loading MMX registers
3722                                 in SImode and DImode.  */
3723   {8, 8},                       /* cost of storing MMX registers
3724                                 in SImode and DImode.  */
3725   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3726   {8, 8, 8, 10, 15},    /* cost of loading SSE registers
3727                                 in 32,64,128,256 and 512-bit.  */
3728   {8, 8, 8, 10, 15},    /* cost of storing SSE registers
3729                                 in 32,64,128,256 and 512-bit.  */
3730   8, 8,                         /* SSE->integer and integer->SSE moves.  */
3731   8, 8,                         /* mask->integer and integer->mask moves.  */
3732   {8, 8, 8},            /* cost of loading mask register
3733                                 in QImode, HImode, SImode.  */
3734   {8, 8, 8},            /* cost of storing mask register
3735                                 in QImode, HImode, SImode.  */
3736   2,                            /* cost of moving mask register.  */
3737   /* End of register allocator costs.  */
3738   },
3739
3740   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3741   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
3742   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3743   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3744   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3745    COSTS_N_INSNS (3),                   /*                               HI.  */
3746    COSTS_N_INSNS (2),                   /*                               SI.  */
3747    COSTS_N_INSNS (2),                   /*                               DI.  */
3748    COSTS_N_INSNS (3)},          /*                               other.  */
3749   0,                            /* cost of multiply per each bit set.  */
3750   {COSTS_N_INSNS (9),                   /* cost of a divide/mod for QI.  */
3751    COSTS_N_INSNS (10),                  /*                          HI.  */
3752    COSTS_N_INSNS (9),                   /*                          SI.  */
3753    COSTS_N_INSNS (50),                  /*                          DI.  */
3754    COSTS_N_INSNS (50)},         /*                          other.  */
3755   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3756   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3757   8,                                    /* "large" insn.  */
3758   17,                                   /* MOVE_RATIO.  */
3759   6,                                    /* CLEAR_RATIO.  */
3760   {8, 8, 8},                            /* cost of loading integer registers
3761                                            in QImode, HImode and SImode.
3762                                            Relative to reg-reg move (2).  */
3763   {8, 8, 8},                    /* cost of storing integer registers.  */
3764   {8, 8, 8, 12, 15},                    /* cost of loading SSE register
3765                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3766   {8, 8, 8, 12, 15},                    /* cost of storing SSE register
3767                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3768   {8, 8, 8, 12, 15},                    /* cost of unaligned loads.  */
3769   {8, 8, 8, 12, 15},                    /* cost of unaligned stores.  */
3770   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3771   8,                            /* cost of moving SSE register to integer.  */
3772   18, 6,                                /* Gather load static, per_elt.  */
3773   18, 6,                                /* Gather store static, per_elt.  */
3774   32,                                   /* size of l1 cache.  */
3775   256,                                  /* size of l2 cache.  */
3776   64,                                   /* size of prefetch block.  */
3777   12,                                   /* number of parallel prefetches.  */
3778   3,                                    /* Branch cost.  */
3779   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3780   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
3781   COSTS_N_INSNS (13),                   /* cost of FDIV instruction.  */
3782   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
3783   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
3784   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
3785
3786   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3787   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3788   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3789   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
3790   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3791   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3792   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
3793   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
3794   COSTS_N_INSNS (11),                   /* cost of SQRTSS instruction.  */
3795   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3796   4, 4, 4, 4,                           /* reassoc int, fp, vec_int, vec_fp.  */
3797   shijidadao_memcpy,
3798   shijidadao_memset,
3799   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3800   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3801   "16:11:8",                            /* Loop alignment.  */
3802   "16:11:8",                            /* Jump alignment.  */
3803   "0:0:8",                              /* Label alignment.  */
3804   "16",                         /* Func alignment.  */
3805   4,                                    /* Small unroll limit.  */
3806   2,                                    /* Small unroll factor.  */
3807   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3808 };
3809
3810
3811
3812 /* Generic should produce code tuned for Core-i7 (and newer chips)
3813    and btver1 (and newer chips).  */
3814
/* memcpy expansion strategy tables referenced from generic_cost.
   Both elements name libcall first, use loop for the 32 entry, a rep-prefix
   algorithm for the 8192 entry, and end with a {-1, libcall, false} entry.
   The two elements differ only in rep_prefix_4_byte versus rep_prefix_8_byte.
   NOTE(review): presumably element 0 serves 32-bit and element 1 serves
   64-bit code -- confirm against the stringop_algs definition.  */
3815 static stringop_algs generic_memcpy[2] = {
3816   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3817              {-1, libcall, false}}},
3818   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3819              {-1, libcall, false}}}};
/* memset expansion strategy tables referenced from generic_cost.
   The entries are identical to those of generic_memcpy: loop for the 32
   entry, rep_prefix_4_byte (first element) or rep_prefix_8_byte (second
   element) for the 8192 entry, then a {-1, libcall, false} entry.
   NOTE(review): presumably element 0 serves 32-bit and element 1 serves
   64-bit code -- confirm against the stringop_algs definition.  */
3820 static stringop_algs generic_memset[2] = {
3821   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3822              {-1, libcall, false}}},
3823   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3824              {-1, libcall, false}}}};
/* Cost table for the generic tuning.  This is a positional initializer of
   struct processor_costs: each value is identified only by the comment on
   its line, so keep values and comments in step.  */
3825 static const
3826 struct processor_costs generic_cost = {
3827   {
3828   /* Start of register allocator costs.  integer->integer move cost is 2. */
3829   6,                                 /* cost for loading QImode using movzbl */
3830   {6, 6, 6},                            /* cost of loading integer registers
3831                                            in QImode, HImode and SImode.
3832                                            Relative to reg-reg move (2).  */
3833   {6, 6, 6},                            /* cost of storing integer registers */
3834   4,                                    /* cost of reg,reg fld/fst */
3835   {6, 6, 12},                           /* cost of loading fp registers
3836                                            in SFmode, DFmode and XFmode */
3837   {6, 6, 12},                           /* cost of storing fp registers
3838                                            in SFmode, DFmode and XFmode */
3839   2,                                    /* cost of moving MMX register */
3840   {6, 6},                               /* cost of loading MMX registers
3841                                            in SImode and DImode */
3842   {6, 6},                               /* cost of storing MMX registers
3843                                            in SImode and DImode */
3844   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3845   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
3846                                            in 32,64,128,256 and 512-bit */
3847   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
3848                                            in 32,64,128,256 and 512-bit */
3849   6, 6,                         /* SSE->integer and integer->SSE moves */
3850   6, 6,                         /* mask->integer and integer->mask moves */
3851   {6, 6, 6},                            /* cost of loading mask register
3852                                            in QImode, HImode, SImode.  */
3853   {6, 6, 6},                    /* cost of storing mask register
3854                                            in QImode, HImode, SImode.  */
3855   2,                                    /* cost of moving mask register.  */
3856   /* End of register allocator costs.  */
3857   },
3858
3859   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3860   /* Setting cost to 2 makes our current implementation of synth_mult result in
3861      use of unnecessary temporary registers causing regression on several
3862      SPECfp benchmarks.  */
3863   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3864   COSTS_N_INSNS (1),                    /* variable shift costs */
3865   COSTS_N_INSNS (1),                    /* constant shift costs */
3866   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3867    COSTS_N_INSNS (3),                   /*                               HI */
3868    COSTS_N_INSNS (3),                   /*                               SI */
3869    COSTS_N_INSNS (3),                   /*                               DI */
3870    COSTS_N_INSNS (4)},                  /*                            other */
3871   0,                                    /* cost of multiply per each bit set */
3872   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
3873    COSTS_N_INSNS (22),                  /*                          HI */
3874    COSTS_N_INSNS (30),                  /*                          SI */
3875    COSTS_N_INSNS (74),                  /*                          DI */
3876    COSTS_N_INSNS (74)},                 /*                          other */
3877   COSTS_N_INSNS (1),                    /* cost of movsx */
3878   COSTS_N_INSNS (1),                    /* cost of movzx */
3879   8,                                    /* "large" insn */
3880   17,                                   /* MOVE_RATIO */
3881   6,                                    /* CLEAR_RATIO */
3882   {6, 6, 6},                            /* cost of loading integer registers
3883                                            in QImode, HImode and SImode.
3884                                            Relative to reg-reg move (2).  */
3885   {6, 6, 6},                            /* cost of storing integer registers */
3886   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3887                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3888   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3889                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3890   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3891   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
3892   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3893   6,                                    /* cost of moving SSE register to integer.  */
3894   18, 6,                                /* Gather load static, per_elt.  */
3895   18, 6,                                /* Gather store static, per_elt.  */
3896   32,                                   /* size of l1 cache.  */
3897   512,                                  /* size of l2 cache.  */
3898   64,                                   /* size of prefetch block */
3899   6,                                    /* number of parallel prefetches */
3900   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
3901      value is increased to perhaps more appropriate value of 5.  */
3902   3,                                    /* Branch cost */
3903   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3904   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
3905   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
3906   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3907   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3908   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
3909
3910   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3911   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3912   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3913   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3914   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3915   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3916   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3917   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3918   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
3919   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3920   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3921   generic_memcpy,
3922   generic_memset,
3923   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3924   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3925   "16",                                 /* Loop alignment.  */
3926   "16:11:8",                            /* Jump alignment.  */
3927   "0:0:8",                              /* Label alignment.  */
3928   "16",                                 /* Func alignment.  */
3929   4,                                    /* Small unroll limit.  */
3930   2,                                    /* Small unroll factor.  */
3931   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
3932 };
3933
3934 /* core_cost should produce code tuned for Core family of CPUs.  */
/* memcpy expansion strategy tables referenced from core_cost.
   Each element names libcall first and then lists {size, algorithm, flag}
   triples, closed by a {-1, libcall, false} entry.  The first element has a
   single rep_prefix_4_byte entry; the second uses loop and then
   rep_prefix_8_byte.
   NOTE(review): presumably element 0 serves 32-bit and element 1 serves
   64-bit code -- confirm against the stringop_algs definition.  */
3935 static stringop_algs core_memcpy[2] = {
3936   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
3937   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
3938              {-1, libcall, false}}}};
/* memset expansion strategy tables referenced from core_cost.
   Each element names libcall first and then lists {size, algorithm, flag}
   triples, closed by a {-1, libcall, false} entry.  The first element steps
   through loop_1_byte, loop and rep_prefix_4_byte; the second uses loop and
   then rep_prefix_8_byte.
   NOTE(review): presumably element 0 serves 32-bit and element 1 serves
   64-bit code -- confirm against the stringop_algs definition.  */
3939 static stringop_algs core_memset[2] = {
3940   {libcall, {{6, loop_1_byte, true},
3941              {24, loop, true},
3942              {8192, rep_prefix_4_byte, true},
3943              {-1, libcall, false}}},
3944   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
3945              {-1, libcall, false}}}};
3946
/* Cost table for the Core family tuning.  This is a positional initializer
   of struct processor_costs: each value is identified only by the comment
   on its line, so keep values and comments in step.  */
3947 static const
3948 struct processor_costs core_cost = {
3949   {
3950   /* Start of register allocator costs.  integer->integer move cost is 2. */
3951   6,                                 /* cost for loading QImode using movzbl */
3952   {4, 4, 4},                            /* cost of loading integer registers
3953                                            in QImode, HImode and SImode.
3954                                            Relative to reg-reg move (2).  */
3955   {6, 6, 6},                            /* cost of storing integer registers */
3956   2,                                    /* cost of reg,reg fld/fst */
3957   {6, 6, 8},                            /* cost of loading fp registers
3958                                            in SFmode, DFmode and XFmode */
3959   {6, 6, 10},                           /* cost of storing fp registers
3960                                            in SFmode, DFmode and XFmode */
3961   2,                                    /* cost of moving MMX register */
3962   {6, 6},                               /* cost of loading MMX registers
3963                                            in SImode and DImode */
3964   {6, 6},                               /* cost of storing MMX registers
3965                                            in SImode and DImode */
3966   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
3967   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
3968                                            in 32,64,128,256 and 512-bit */
3969   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
3970                                            in 32,64,128,256 and 512-bit */
3971   6, 6,                         /* SSE->integer and integer->SSE moves */
3972   6, 6,                         /* mask->integer and integer->mask moves */
3973   {4, 4, 4},                            /* cost of loading mask register
3974                                            in QImode, HImode, SImode.  */
3975   {6, 6, 6},                            /* cost of storing mask register
3976                                            in QImode, HImode, SImode.  */
3977   2,                                    /* cost of moving mask register.  */
3978   /* End of register allocator costs.  */
3979   },
3980
3981   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3982   /* On all chips taken into consideration lea is 2 cycles and more.  With
3983      this cost however our current implementation of synth_mult results in
3984      use of unnecessary temporary registers causing regression on several
3985      SPECfp benchmarks.  */
3986   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3987   COSTS_N_INSNS (1),                    /* variable shift costs */
3988   COSTS_N_INSNS (1),                    /* constant shift costs */
3989   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3990    COSTS_N_INSNS (4),                   /*                               HI */
3991    COSTS_N_INSNS (3),                   /*                               SI */
3992    /* Here we tune for Sandybridge or newer.  */
3993    COSTS_N_INSNS (3),                   /*                               DI */
3994    COSTS_N_INSNS (3)},                  /*                            other */
3995   0,                                    /* cost of multiply per each bit set */
3996   /* Expanding div/mod currently doesn't consider parallelism. So the cost
3997      model is not realistic. We compensate by increasing the latencies a bit.  */
3998   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
3999    COSTS_N_INSNS (11),                  /*                          HI */
4000    COSTS_N_INSNS (14),                  /*                          SI */
4001    COSTS_N_INSNS (81),                  /*                          DI */
4002    COSTS_N_INSNS (81)},                 /*                          other */
4003   COSTS_N_INSNS (1),                    /* cost of movsx */
4004   COSTS_N_INSNS (1),                    /* cost of movzx */
4005   8,                                    /* "large" insn */
4006   17,                                   /* MOVE_RATIO */
4007   6,                                    /* CLEAR_RATIO */
4008   {4, 4, 4},                            /* cost of loading integer registers
4009                                            in QImode, HImode and SImode.
4010                                            Relative to reg-reg move (2).  */
4011   {6, 6, 6},                            /* cost of storing integer registers */
4012   {6, 6, 6, 6, 12},                     /* cost of loading SSE register
4013                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
4014   {6, 6, 6, 6, 12},                     /* cost of storing SSE register
4015                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
4016   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
4017   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
4018   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
4019   2,                                    /* cost of moving SSE register to integer.  */
4020   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
4021      rec. throughput 6.  (NOTE(review): VGATHERDPD is named twice -- confirm.)
4022      So 5 uops statically and one uop per load.  */
4023   10, 6,                                /* Gather load static, per_elt.  */
4024   10, 6,                                /* Gather store static, per_elt.  */
4025   64,                                   /* size of l1 cache.  */
4026   512,                                  /* size of l2 cache.  */
4027   64,                                   /* size of prefetch block */
4028   6,                                    /* number of parallel prefetches */
4029   /* FIXME perhaps more appropriate value is 5.  */
4030   3,                                    /* Branch cost */
4031   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
4032   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
4033   /* 10-24 */
4034   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
4035   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
4036   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
4037   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
4038
4039   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
4040   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
4041   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
4042   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
4043   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
4044   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
4045   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
4046   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
4047   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
4048   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
4049   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
4050   core_memcpy,
4051   core_memset,
4052   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
4053   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
4054   "16:11:8",                            /* Loop alignment.  */
4055   "16:11:8",                            /* Jump alignment.  */
4056   "0:0:8",                              /* Label alignment.  */
4057   "16",                                 /* Func alignment.  */
4058   4,                                    /* Small unroll limit.  */
4059   2,                                    /* Small unroll factor.  */
4060   COSTS_N_INSNS (2),                    /* Branch mispredict scale.  */
4061 };
4062