gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2024 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)  /* Converts a byte count N into cost units; under the assumption above, COSTS_N_BYTES (2) == COSTS_N_INSNS (1) == 4.  */
  27
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}  /* Placeholder stringop_algs initializer that names only libcall; used as the second element of the i386/i486/pentium memcpy and memset tables below.  */
  29
  30 static stringop_algs ix86_size_memcpy[2] = {  /* memcpy strategy table referenced by ix86_size_cost.  */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* Both elements are identical and name only rep_prefix_1_byte.  */
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};  /* NOTE(review): -1 presumably means "no upper size bound" - confirm against the stringop_algs declaration.  */
  33 static stringop_algs ix86_size_memset[2] = {  /* memset strategy table referenced by ix86_size_cost.  */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* Both elements are identical and name only rep_prefix_1_byte.  */
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};  /* NOTE(review): -1 presumably means "no upper size bound" - confirm against the stringop_algs declaration.  */
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size; most insn costs below are byte counts scaled by COSTS_N_BYTES */
  39   {
  40   /* Start of register allocator costs.  integer->integer move cost is 2. */
  41   2,                                 /* cost for loading QImode using movzbl */
  42   {2, 2, 2},                            /* cost of loading integer registers
  43                                            in QImode, HImode and SImode.
  44                                            Relative to reg-reg move (2).  */
  45   {2, 2, 2},                            /* cost of storing integer registers */
  46   2,                                    /* cost of reg,reg fld/fst */
  47   {2, 2, 2},                            /* cost of loading fp registers
  48                                            in SFmode, DFmode and XFmode */
  49   {2, 2, 2},                            /* cost of storing fp registers
  50                                            in SFmode, DFmode and XFmode */
  51   3,                                    /* cost of moving MMX register */
  52   {3, 3},                               /* cost of loading MMX registers
  53                                            in SImode and DImode */
  54   {3, 3},                               /* cost of storing MMX registers
  55                                            in SImode and DImode */
  56   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  57   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  58                                            in 32,64,128,256 and 512-bit */
  59   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  60                                            in 32,64,128,256 and 512-bit */
  61   3, 3,                         /* SSE->integer and integer->SSE moves */
  62   3, 3,                         /* mask->integer and integer->mask moves */
  63   {2, 2, 2},                            /* cost of loading mask register
  64                                            in QImode, HImode, SImode.  */
  65   {2, 2, 2},                            /* cost of storing mask register
  66                                            in QImode, HImode, SImode.  */
  67   2,                                    /* cost of moving mask register.  */
  68   /* End of register allocator costs.  */
  69   },
  70
  71   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  72   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  73   COSTS_N_BYTES (2),                    /* variable shift costs */
  74   COSTS_N_BYTES (3),                    /* constant shift costs */
  75   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  76    COSTS_N_BYTES (3),                   /*                               HI */
  77    COSTS_N_BYTES (3),                   /*                               SI */
  78    COSTS_N_BYTES (3),                   /*                               DI */
  79    COSTS_N_BYTES (5)},                  /*                            other */
  80   0,                                    /* cost of multiply per each bit set */
  81   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  82    COSTS_N_BYTES (3),                   /*                          HI */
  83    COSTS_N_BYTES (3),                   /*                          SI */
  84    COSTS_N_BYTES (3),                   /*                          DI */
  85    COSTS_N_BYTES (5)},                  /*                          other */
  86   COSTS_N_BYTES (3),                    /* cost of movsx */
  87   COSTS_N_BYTES (3),                    /* cost of movzx */
  88   0,                                    /* "large" insn */
  89   2,                                    /* MOVE_RATIO */
  90   2,                                    /* CLEAR_RATIO */
  91   {2, 2, 2},                            /* cost of loading integer registers
  92                                            in QImode, HImode and SImode.
  93                                            Relative to reg-reg move (2).  */
  94   {2, 2, 2},                            /* cost of storing integer registers */
  95   {3, 3, 3, 3, 3},                      /* cost of loading SSE register
  96                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  97   {3, 3, 3, 3, 3},                      /* cost of storing SSE register
  98                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  99   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load;
 100                                            five entries, presumably 32bit..512bit like the aligned table above - TODO confirm */
 101   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE store;
 102                                            five entries, presumably 32bit..512bit like the aligned table above - TODO confirm */
 103   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
 104   3,                                    /* cost of moving SSE register to integer.  */
 105   5, 0,                                 /* Gather load static, per_elt.  */
 106   5, 0,                                 /* Gather store static, per_elt.  */
 107   0,                                    /* size of l1 cache  */
 108   0,                                    /* size of l2 cache  */
 109   0,                                    /* size of prefetch block */
 110   0,                                    /* number of parallel prefetches */
 111   2,                                    /* Branch cost */
 112   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
 113   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
 114   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
 115   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
 116   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
 117   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
 118
 119   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 120   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 121   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 122   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 123   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 124   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 125   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 126   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 127   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 128   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 129   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 130   ix86_size_memcpy,                     /* memcpy strategy table defined above.  */
 131   ix86_size_memset,                     /* memset strategy table defined above.  */
 132   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 133   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 134   NULL,                                 /* Loop alignment.  */
 135   NULL,                                 /* Jump alignment.  */
 136   NULL,                                 /* Label alignment.  */
 137   NULL,                                 /* Func alignment.  */
 138   4,                                    /* Small unroll limit.  */
 139   2,                                    /* Small unroll factor.  */
 140 };
 141
 142 /* Processor costs (relative to an add) */
 143 static stringop_algs i386_memcpy[2] = {  /* memcpy strategy table referenced by i386_cost.  */
 144   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* First element names only rep_prefix_1_byte.  */
 145   DUMMY_STRINGOP_ALGS};  /* Second element is the libcall-only placeholder.  */
 146 static stringop_algs i386_memset[2] = {  /* memset strategy table referenced by i386_cost.  */
 147   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* First element names only rep_prefix_1_byte.  */
 148   DUMMY_STRINGOP_ALGS};  /* Second element is the libcall-only placeholder.  */
 149
 150 static const
 151 struct processor_costs i386_cost = {    /* 386 specific costs; insn costs below are expressed with COSTS_N_INSNS */
 152   {
 153   /* Start of register allocator costs.  integer->integer move cost is 2. */
 154   4,                                 /* cost for loading QImode using movzbl */
 155   {2, 4, 2},                            /* cost of loading integer registers
 156                                            in QImode, HImode and SImode.
 157                                            Relative to reg-reg move (2).  */
 158   {2, 4, 2},                            /* cost of storing integer registers */
 159   2,                                    /* cost of reg,reg fld/fst */
 160   {8, 8, 8},                            /* cost of loading fp registers
 161                                            in SFmode, DFmode and XFmode */
 162   {8, 8, 8},                            /* cost of storing fp registers
 163                                            in SFmode, DFmode and XFmode */
 164   2,                                    /* cost of moving MMX register */
 165   {4, 8},                               /* cost of loading MMX registers
 166                                            in SImode and DImode */
 167   {4, 8},                               /* cost of storing MMX registers
 168                                            in SImode and DImode */
 169   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 170   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 171                                            in 32,64,128,256 and 512-bit */
 172   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 173                                            in 32,64,128,256 and 512-bit */
 174   3, 3,                         /* SSE->integer and integer->SSE moves */
 175   3, 3,                         /* mask->integer and integer->mask moves */
 176   {2, 4, 2},                            /* cost of loading mask register
 177                                            in QImode, HImode, SImode.  */
 178   {2, 4, 2},                            /* cost of storing mask register
 179                                            in QImode, HImode, SImode.  */
 180   2,                                    /* cost of moving mask register.  */
 181   /* End of register allocator costs.  */
 182   },
 183
 184   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 185   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 186   COSTS_N_INSNS (3),                    /* variable shift costs */
 187   COSTS_N_INSNS (2),                    /* constant shift costs */
 188   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 189    COSTS_N_INSNS (6),                   /*                               HI */
 190    COSTS_N_INSNS (6),                   /*                               SI */
 191    COSTS_N_INSNS (6),                   /*                               DI */
 192    COSTS_N_INSNS (6)},                  /*                            other */
 193   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set.  NOTE(review): scaled with COSTS_N_INSNS here, while i486_cost uses a bare 1 and pentium_cost a bare 0 in this slot - confirm the intended unit.  */
 194   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 195    COSTS_N_INSNS (23),                  /*                          HI */
 196    COSTS_N_INSNS (23),                  /*                          SI */
 197    COSTS_N_INSNS (23),                  /*                          DI */
 198    COSTS_N_INSNS (23)},                 /*                          other */
 199   COSTS_N_INSNS (3),                    /* cost of movsx */
 200   COSTS_N_INSNS (2),                    /* cost of movzx */
 201   15,                                   /* "large" insn */
 202   3,                                    /* MOVE_RATIO */
 203   3,                                    /* CLEAR_RATIO */
 204   {2, 4, 2},                            /* cost of loading integer registers
 205                                            in QImode, HImode and SImode.
 206                                            Relative to reg-reg move (2).  */
 207   {2, 4, 2},                            /* cost of storing integer registers */
 208   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 209                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 210   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 211                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 212   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 213   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 214   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 215   3,                                    /* cost of moving SSE register to integer.  */
 216   4, 4,                                 /* Gather load static, per_elt.  */
 217   4, 4,                                 /* Gather store static, per_elt.  */
 218   0,                                    /* size of l1 cache  */
 219   0,                                    /* size of l2 cache  */
 220   0,                                    /* size of prefetch block */
 221   0,                                    /* number of parallel prefetches */
 222   1,                                    /* Branch cost */
 223   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 224   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 225   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 226   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 227   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 228   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 229
 230   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 231   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 232   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 233   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 234   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 235   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 236   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 237   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 238   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 239   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 240   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 241   i386_memcpy,                          /* memcpy strategy table defined above.  */
 242   i386_memset,                          /* memset strategy table defined above.  */
 243   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 244   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 245   "4",                                  /* Loop alignment.  */
 246   "4",                                  /* Jump alignment.  */
 247   NULL,                                 /* Label alignment.  */
 248   "4",                                  /* Func alignment.  */
 249   4,                                    /* Small unroll limit.  */
 250   2,                                    /* Small unroll factor.  */
 251 };
 252
 253 static stringop_algs i486_memcpy[2] = {  /* memcpy strategy table referenced by i486_cost.  */
 254   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},  /* First element names only rep_prefix_4_byte.  */
 255   DUMMY_STRINGOP_ALGS};  /* Second element is the libcall-only placeholder.  */
 256 static stringop_algs i486_memset[2] = {  /* memset strategy table referenced by i486_cost.  */
 257   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},  /* First element names only rep_prefix_4_byte.  */
 258   DUMMY_STRINGOP_ALGS};  /* Second element is the libcall-only placeholder.  */
 259
 260 static const
 261 struct processor_costs i486_cost = {    /* 486 specific costs; insn costs below are expressed with COSTS_N_INSNS */
 262   {
 263   /* Start of register allocator costs.  integer->integer move cost is 2. */
 264   4,                                 /* cost for loading QImode using movzbl */
 265   {2, 4, 2},                            /* cost of loading integer registers
 266                                            in QImode, HImode and SImode.
 267                                            Relative to reg-reg move (2).  */
 268   {2, 4, 2},                            /* cost of storing integer registers */
 269   2,                                    /* cost of reg,reg fld/fst */
 270   {8, 8, 8},                            /* cost of loading fp registers
 271                                            in SFmode, DFmode and XFmode */
 272   {8, 8, 8},                            /* cost of storing fp registers
 273                                            in SFmode, DFmode and XFmode */
 274   2,                                    /* cost of moving MMX register */
 275   {4, 8},                               /* cost of loading MMX registers
 276                                            in SImode and DImode */
 277   {4, 8},                               /* cost of storing MMX registers
 278                                            in SImode and DImode */
 279   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 280   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 281                                            in 32,64,128,256 and 512-bit */
 282   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 283                                            in 32,64,128,256 and 512-bit */
 284   3, 3,                         /* SSE->integer and integer->SSE moves */
 285   3, 3,                         /* mask->integer and integer->mask moves */
 286   {2, 4, 2},                            /* cost of loading mask register
 287                                            in QImode, HImode, SImode.  */
 288   {2, 4, 2},                            /* cost of storing mask register
 289                                            in QImode, HImode, SImode.  */
 290   2,                                    /* cost of moving mask register.  */
 291   /* End of register allocator costs.  */
 292   },
 293
 294   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 295   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 296   COSTS_N_INSNS (3),                    /* variable shift costs */
 297   COSTS_N_INSNS (2),                    /* constant shift costs */
 298   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 299    COSTS_N_INSNS (12),                  /*                               HI */
 300    COSTS_N_INSNS (12),                  /*                               SI */
 301    COSTS_N_INSNS (12),                  /*                               DI */
 302    COSTS_N_INSNS (12)},                 /*                            other */
 303   1,                                    /* cost of multiply per each bit set.  NOTE(review): bare 1 here, whereas i386_cost uses COSTS_N_INSNS (1) in this slot - confirm the intended unit.  */
 304   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 305    COSTS_N_INSNS (40),                  /*                          HI */
 306    COSTS_N_INSNS (40),                  /*                          SI */
 307    COSTS_N_INSNS (40),                  /*                          DI */
 308    COSTS_N_INSNS (40)},                 /*                          other */
 309   COSTS_N_INSNS (3),                    /* cost of movsx */
 310   COSTS_N_INSNS (2),                    /* cost of movzx */
 311   15,                                   /* "large" insn */
 312   3,                                    /* MOVE_RATIO */
 313   3,                                    /* CLEAR_RATIO */
 314   {2, 4, 2},                            /* cost of loading integer registers
 315                                            in QImode, HImode and SImode.
 316                                            Relative to reg-reg move (2).  */
 317   {2, 4, 2},                            /* cost of storing integer registers */
 318   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 319                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 320   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 321                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 322   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 323   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 324   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 325   3,                                    /* cost of moving SSE register to integer.  */
 326   4, 4,                                 /* Gather load static, per_elt.  */
 327   4, 4,                                 /* Gather store static, per_elt.  */
 328   4,                                    /* size of l1 cache.  486 has 8kB cache
 329                                            shared for code and data, so 4kB is
 330                                            not really precise.  */
 331   4,                                    /* size of l2 cache  */
 332   0,                                    /* size of prefetch block */
 333   0,                                    /* number of parallel prefetches */
 334   1,                                    /* Branch cost */
 335   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 336   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 337   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 338   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 339   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 340   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 341
 342   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 343   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 344   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 345   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 346   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 347   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 348   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 349   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.  NOTE(review): 74 differs from the FDIV and DIVSS value of 73 in this table - confirm this is intentional.  */
 350   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 351   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 352   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 353   i486_memcpy,                          /* memcpy strategy table defined above.  */
 354   i486_memset,                          /* memset strategy table defined above.  */
 355   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 356   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 357   "16",                                 /* Loop alignment.  */
 358   "16",                                 /* Jump alignment.  */
 359   "0:0:8",                              /* Label alignment.  */
 360   "16",                                 /* Func alignment.  */
 361   4,                                    /* Small unroll limit.  */
 362   2,                                    /* Small unroll factor.  */
 363 };
 364
 365 static stringop_algs pentium_memcpy[2] = {  /* memcpy strategy table referenced by pentium_cost.  */
 366   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* Two size entries: 256 paired with rep_prefix_4_byte, then -1 paired with libcall; presumably "up to 256 bytes" and "any larger size" - TODO confirm.  */
 367   DUMMY_STRINGOP_ALGS};  /* Second element is the libcall-only placeholder.  */
 368 static stringop_algs pentium_memset[2] = {  /* memset strategy table referenced by pentium_cost.  */
 369   {libcall, {{-1, rep_prefix_4_byte, false}}},  /* Leading algorithm is libcall, while the single size entry (-1) names rep_prefix_4_byte.  */
 370   DUMMY_STRINGOP_ALGS};  /* Second element is the libcall-only placeholder.  */
 371
 372 static const
 373 struct processor_costs pentium_cost = {  /* Pentium specific costs; insn costs below are expressed with COSTS_N_INSNS */
 374   {
 375   /* Start of register allocator costs.  integer->integer move cost is 2. */
 376   6,                                 /* cost for loading QImode using movzbl */
 377   {2, 4, 2},                            /* cost of loading integer registers
 378                                            in QImode, HImode and SImode.
 379                                            Relative to reg-reg move (2).  */
 380   {2, 4, 2},                            /* cost of storing integer registers */
 381   2,                                    /* cost of reg,reg fld/fst */
 382   {2, 2, 6},                            /* cost of loading fp registers
 383                                            in SFmode, DFmode and XFmode */
 384   {4, 4, 6},                            /* cost of storing fp registers
 385                                            in SFmode, DFmode and XFmode */
 386   8,                                    /* cost of moving MMX register */
 387   {8, 8},                               /* cost of loading MMX registers
 388                                            in SImode and DImode */
 389   {8, 8},                               /* cost of storing MMX registers
 390                                            in SImode and DImode */
 391   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 392   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 393                                            in 32,64,128,256 and 512-bit */
 394   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 395                                            in 32,64,128,256 and 512-bit */
 396   3, 3,                         /* SSE->integer and integer->SSE moves */
 397   3, 3,                         /* mask->integer and integer->mask moves */
 398   {2, 4, 2},                            /* cost of loading mask register
 399                                            in QImode, HImode, SImode.  */
 400   {2, 4, 2},                            /* cost of storing mask register
 401                                            in QImode, HImode, SImode.  */
 402   2,                                    /* cost of moving mask register.  */
 403   /* End of register allocator costs.  */
 404   },
 405
 406   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 407   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 408   COSTS_N_INSNS (4),                    /* variable shift costs */
 409   COSTS_N_INSNS (1),                    /* constant shift costs */
 410   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 411    COSTS_N_INSNS (11),                  /*                               HI */
 412    COSTS_N_INSNS (11),                  /*                               SI */
 413    COSTS_N_INSNS (11),                  /*                               DI */
 414    COSTS_N_INSNS (11)},                 /*                            other */
 415   0,                                    /* cost of multiply per each bit set */
 416   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 417    COSTS_N_INSNS (25),                  /*                          HI */
 418    COSTS_N_INSNS (25),                  /*                          SI */
 419    COSTS_N_INSNS (25),                  /*                          DI */
 420    COSTS_N_INSNS (25)},                 /*                          other */
 421   COSTS_N_INSNS (3),                    /* cost of movsx */
 422   COSTS_N_INSNS (2),                    /* cost of movzx */
 423   8,                                    /* "large" insn */
 424   6,                                    /* MOVE_RATIO */
 425   6,                                    /* CLEAR_RATIO */
 426   {2, 4, 2},                            /* cost of loading integer registers
 427                                            in QImode, HImode and SImode.
 428                                            Relative to reg-reg move (2).  */
 429   {2, 4, 2},                            /* cost of storing integer registers */
 430   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 431                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 432   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 433                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 434   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 435   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 436   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 437   3,                                    /* cost of moving SSE register to integer.  */
 438   4, 4,                                 /* Gather load static, per_elt.  */
 439   4, 4,                                 /* Gather store static, per_elt.  */
 440   8,                                    /* size of l1 cache.  */
 441   8,                                    /* size of l2 cache  */
 442   0,                                    /* size of prefetch block */
 443   0,                                    /* number of parallel prefetches */
 444   2,                                    /* Branch cost */
 445   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 446   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 447   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 448   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 449   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 450   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 451
 452   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 453   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 454   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 455   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 456   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 457   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 458   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 459   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 460   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 461   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 462   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 463   pentium_memcpy,                       /* memcpy strategy table defined above.  */
 464   pentium_memset,                       /* memset strategy table defined above.  */
 465   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 466   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 467   "16:8:8",                             /* Loop alignment.  */
 468   "16:8:8",                             /* Jump alignment.  */
 469   "0:0:8",                              /* Label alignment.  */
 470   "16",                                 /* Func alignment.  */
 471   4,                                    /* Small unroll limit.  */
 472   2,                                    /* Small unroll factor.  */
 473 };
 474
 475 static const
 476 struct processor_costs lakemont_cost = { /* costs for tuning for Lakemont */
 477   {
 478   /* Start of register allocator costs.  integer->integer move cost is 2. */
 479   6,                                 /* cost for loading QImode using movzbl */
 480   {2, 4, 2},                            /* cost of loading integer registers
 481                                            in QImode, HImode and SImode.
 482                                            Relative to reg-reg move (2).  */
 483   {2, 4, 2},                            /* cost of storing integer registers */
 484   2,                                    /* cost of reg,reg fld/fst */
 485   {2, 2, 6},                            /* cost of loading fp registers
 486                                            in SFmode, DFmode and XFmode */
 487   {4, 4, 6},                            /* cost of storing fp registers
 488                                            in SFmode, DFmode and XFmode */
 489   8,                                    /* cost of moving MMX register */
 490   {8, 8},                               /* cost of loading MMX registers
 491                                            in SImode and DImode */
 492   {8, 8},                               /* cost of storing MMX registers
 493                                            in SImode and DImode */
 494   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 495   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 496                                            in 32,64,128,256 and 512-bit */
 497   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 498                                            in 32,64,128,256 and 512-bit */
 499   3, 3,                         /* SSE->integer and integer->SSE moves */
 500   3, 3,                         /* mask->integer and integer->mask moves */
 501   {2, 4, 2},                            /* cost of loading mask register
 502                                            in QImode, HImode, SImode.  */
 503   {2, 4, 2},                            /* cost of storing mask register
 504                                            in QImode, HImode, SImode.  */
 505   2,                                    /* cost of moving mask register.  */
 506   /* End of register allocator costs.  */
 507   },
 508
 509   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 510   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction (one unit above an add) */
 511   COSTS_N_INSNS (1),                    /* variable shift costs */
 512   COSTS_N_INSNS (1),                    /* constant shift costs */
 513   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 514    COSTS_N_INSNS (11),                  /*                               HI */
 515    COSTS_N_INSNS (11),                  /*                               SI */
 516    COSTS_N_INSNS (11),                  /*                               DI */
 517    COSTS_N_INSNS (11)},                 /*                            other */
 518   0,                                    /* cost of multiply per each bit set */
 519   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 520    COSTS_N_INSNS (25),                  /*                          HI */
 521    COSTS_N_INSNS (25),                  /*                          SI */
 522    COSTS_N_INSNS (25),                  /*                          DI */
 523    COSTS_N_INSNS (25)},                 /*                          other */
 524   COSTS_N_INSNS (3),                    /* cost of movsx */
 525   COSTS_N_INSNS (2),                    /* cost of movzx */
 526   8,                                    /* "large" insn */
 527   17,                                   /* MOVE_RATIO */
 528   6,                                    /* CLEAR_RATIO */
 529   {2, 4, 2},                            /* cost of loading integer registers
 530                                            in QImode, HImode and SImode.
 531                                            Relative to reg-reg move (2).  */
 532   {2, 4, 2},                            /* cost of storing integer registers */
 533   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 534                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 535   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 536                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 537   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 538   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 539   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 540   3,                                    /* cost of moving SSE register to integer.  */
 541   4, 4,                                 /* Gather load static, per_elt.  */
 542   4, 4,                                 /* Gather store static, per_elt.  */
 543   8,                                    /* size of l1 cache.  */
 544   8,                                    /* size of l2 cache.  */
 545   0,                                    /* size of prefetch block */
 546   0,                                    /* number of parallel prefetches */
 547   2,                                    /* Branch cost */
 548   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 549   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 550   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 551   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 552   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 553   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 554
 555   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 556   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 557   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 558   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 559   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 560   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 561   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 562   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 563   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 564   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 565   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 566   pentium_memcpy,                       /* memcpy strategy (Pentium table reused).  */
 567   pentium_memset,                       /* memset strategy (Pentium table reused).  */
 568   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 569   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 570   "16:8:8",                             /* Loop alignment.  */
 571   "16:8:8",                             /* Jump alignment.  */
 572   "0:0:8",                              /* Label alignment.  */
 573   "16",                                 /* Func alignment.  */
 574   4,                                    /* Small unroll limit.  */
 575   2,                                    /* Small unroll factor.  */
 576 };
 577
 578 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 579    (we ensure the alignment).  For small blocks an inline loop is still a
 580    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 581    way to go.  Rep movsb apparently has a more expensive startup time in the
 582    CPU, but after 4K the difference is down in the noise.  */
 583 static stringop_algs pentiumpro_memcpy[2] = { /* PentiumPro memcpy strategies */
 584   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 585                        {8192, rep_prefix_4_byte, false},
 586                        {-1, rep_prefix_1_byte, false}}}, /* NOTE(review): -1 presumably means "any larger size"; confirm */
 587   DUMMY_STRINGOP_ALGS}; /* second slot: libcall-only placeholder */
 588 static stringop_algs pentiumpro_memset[2] = { /* PentiumPro memset strategies */
 589   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 590                        {8192, rep_prefix_4_byte, false},
 591                        {-1, libcall, false}}}, /* NOTE(review): -1 presumably means "any larger size"; confirm */
 592   DUMMY_STRINGOP_ALGS}; /* second slot: libcall-only placeholder */
 593 static const
 594 struct processor_costs pentiumpro_cost = { /* costs for tuning for PentiumPro */
 595   {
 596   /* Start of register allocator costs.  integer->integer move cost is 2. */
 597   2,                                 /* cost for loading QImode using movzbl */
 598   {4, 4, 4},                            /* cost of loading integer registers
 599                                            in QImode, HImode and SImode.
 600                                            Relative to reg-reg move (2).  */
 601   {2, 2, 2},                            /* cost of storing integer registers */
 602   2,                                    /* cost of reg,reg fld/fst */
 603   {2, 2, 6},                            /* cost of loading fp registers
 604                                            in SFmode, DFmode and XFmode */
 605   {4, 4, 6},                            /* cost of storing fp registers
 606                                            in SFmode, DFmode and XFmode */
 607   2,                                    /* cost of moving MMX register */
 608   {2, 2},                               /* cost of loading MMX registers
 609                                            in SImode and DImode */
 610   {2, 2},                               /* cost of storing MMX registers
 611                                            in SImode and DImode */
 612   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 613   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 614                                            in 32,64,128,256 and 512-bit */
 615   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 616                                            in 32,64,128,256 and 512-bit */
 617   3, 3,                         /* SSE->integer and integer->SSE moves */
 618   3, 3,                         /* mask->integer and integer->mask moves */
 619   {4, 4, 4},                            /* cost of loading mask register
 620                                            in QImode, HImode, SImode.  */
 621   {2, 2, 2},                            /* cost of storing mask register
 622                                            in QImode, HImode, SImode.  */
 623   2,                                    /* cost of moving mask register.  */
 624   /* End of register allocator costs.  */
 625   },
 626
 627   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 628   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 629   COSTS_N_INSNS (1),                    /* variable shift costs */
 630   COSTS_N_INSNS (1),                    /* constant shift costs */
 631   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 632    COSTS_N_INSNS (4),                   /*                               HI */
 633    COSTS_N_INSNS (4),                   /*                               SI */
 634    COSTS_N_INSNS (4),                   /*                               DI */
 635    COSTS_N_INSNS (4)},                  /*                            other */
 636   0,                                    /* cost of multiply per each bit set */
 637   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 638    COSTS_N_INSNS (17),                  /*                          HI */
 639    COSTS_N_INSNS (17),                  /*                          SI */
 640    COSTS_N_INSNS (17),                  /*                          DI */
 641    COSTS_N_INSNS (17)},                 /*                          other */
 642   COSTS_N_INSNS (1),                    /* cost of movsx */
 643   COSTS_N_INSNS (1),                    /* cost of movzx */
 644   8,                                    /* "large" insn */
 645   6,                                    /* MOVE_RATIO */
 646   6,                                    /* CLEAR_RATIO */
 647   {4, 4, 4},                            /* cost of loading integer registers
 648                                            in QImode, HImode and SImode.
 649                                            Relative to reg-reg move (2).  */
 650   {2, 2, 2},                            /* cost of storing integer registers */
 651   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 652                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 653   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 654                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 655   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 656   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 657   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 658   3,                                    /* cost of moving SSE register to integer.  */
 659   4, 4,                                 /* Gather load static, per_elt.  */
 660   4, 4,                                 /* Gather store static, per_elt.  */
 661   8,                                    /* size of l1 cache.  */
 662   256,                                  /* size of l2 cache.  */
 663   32,                                   /* size of prefetch block */
 664   6,                                    /* number of parallel prefetches */
 665   2,                                    /* Branch cost */
 666   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 667   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 668   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 669   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 670   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 671   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 672
 673   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 674   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 675   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 676   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 677   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 678   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 679   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 680   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 681   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 682   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 683   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 684   pentiumpro_memcpy,                    /* memcpy strategy table.  */
 685   pentiumpro_memset,                    /* memset strategy table.  */
 686   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 687   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 688   "16",                                 /* Loop alignment.  */
 689   "16:11:8",                            /* Jump alignment.  */
 690   "0:0:8",                              /* Label alignment.  */
 691   "16",                                 /* Func alignment.  */
 692   4,                                    /* Small unroll limit.  */
 693   2,                                    /* Small unroll factor.  */
 694 };
 695
 696 static stringop_algs geode_memcpy[2] = { /* Geode memcpy strategies */
 697   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 698   DUMMY_STRINGOP_ALGS}; /* second slot: libcall-only placeholder */
 699 static stringop_algs geode_memset[2] = { /* Geode memset strategies */
 700   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 701   DUMMY_STRINGOP_ALGS}; /* second slot: libcall-only placeholder */
 702 static const
 703 struct processor_costs geode_cost = { /* costs for tuning for Geode */
 704   {
 705   /* Start of register allocator costs.  integer->integer move cost is 2. */
 706   2,                                 /* cost for loading QImode using movzbl */
 707   {2, 2, 2},                            /* cost of loading integer registers
 708                                            in QImode, HImode and SImode.
 709                                            Relative to reg-reg move (2).  */
 710   {2, 2, 2},                            /* cost of storing integer registers */
 711   2,                                    /* cost of reg,reg fld/fst */
 712   {2, 2, 2},                            /* cost of loading fp registers
 713                                            in SFmode, DFmode and XFmode */
 714   {4, 6, 6},                            /* cost of storing fp registers
 715                                            in SFmode, DFmode and XFmode */
 716   2,                                    /* cost of moving MMX register */
 717   {2, 2},                               /* cost of loading MMX registers
 718                                            in SImode and DImode */
 719   {2, 2},                               /* cost of storing MMX registers
 720                                            in SImode and DImode */
 721   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 722   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 723                                            in 32,64,128,256 and 512-bit */
 724   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 725                                            in 32,64,128,256 and 512-bit */
 726   6, 6,                         /* SSE->integer and integer->SSE moves */
 727   6, 6,                         /* mask->integer and integer->mask moves */
 728   {2, 2, 2},                            /* cost of loading mask register
 729                                            in QImode, HImode, SImode.  */
 730   {2, 2, 2},                            /* cost of storing mask register
 731                                            in QImode, HImode, SImode.  */
 732   2,                                    /* cost of moving mask register.  */
 733   /* End of register allocator costs.  */
 734   },
 735
 736   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 737   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 738   COSTS_N_INSNS (2),                    /* variable shift costs */
 739   COSTS_N_INSNS (1),                    /* constant shift costs */
 740   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 741    COSTS_N_INSNS (4),                   /*                               HI */
 742    COSTS_N_INSNS (7),                   /*                               SI */
 743    COSTS_N_INSNS (7),                   /*                               DI */
 744    COSTS_N_INSNS (7)},                  /*                            other */
 745   0,                                    /* cost of multiply per each bit set */
 746   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 747    COSTS_N_INSNS (23),                  /*                          HI */
 748    COSTS_N_INSNS (39),                  /*                          SI */
 749    COSTS_N_INSNS (39),                  /*                          DI */
 750    COSTS_N_INSNS (39)},                 /*                          other */
 751   COSTS_N_INSNS (1),                    /* cost of movsx */
 752   COSTS_N_INSNS (1),                    /* cost of movzx */
 753   8,                                    /* "large" insn */
 754   4,                                    /* MOVE_RATIO */
 755   4,                                    /* CLEAR_RATIO */
 756   {2, 2, 2},                            /* cost of loading integer registers
 757                                            in QImode, HImode and SImode.
 758                                            Relative to reg-reg move (2).  */
 759   {2, 2, 2},                            /* cost of storing integer registers */
 760   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 761                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 762   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 763                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 764   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 765   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 766   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 767   6,                                    /* cost of moving SSE register to integer.  */
 768   2, 2,                                 /* Gather load static, per_elt.  */
 769   2, 2,                                 /* Gather store static, per_elt.  */
 770   64,                                   /* size of l1 cache.  */
 771   128,                                  /* size of l2 cache.  */
 772   32,                                   /* size of prefetch block */
 773   1,                                    /* number of parallel prefetches */
 774   1,                                    /* Branch cost */
 775   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 776   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 777   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 778   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 779   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 780   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 781
 782   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 783   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 784   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 785   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 786   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 787   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 788   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 789   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 790   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 791   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 792   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 793   geode_memcpy,                         /* memcpy strategy table.  */
 794   geode_memset,                         /* memset strategy table.  */
 795   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 796   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 797   NULL,                                 /* Loop alignment.  */
 798   NULL,                                 /* Jump alignment.  */
 799   NULL,                                 /* Label alignment.  */
 800   NULL,                                 /* Func alignment.  */
 801   4,                                    /* Small unroll limit.  */
 802   2,                                    /* Small unroll factor.  */
 803 };
 804
 805 static stringop_algs k6_memcpy[2] = { /* K6 memcpy strategies */
 806   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 807   DUMMY_STRINGOP_ALGS}; /* second slot: libcall-only placeholder */
 808 static stringop_algs k6_memset[2] = { /* K6 memset strategies */
 809   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 810   DUMMY_STRINGOP_ALGS}; /* second slot: libcall-only placeholder */
 811 static const
 812 struct processor_costs k6_cost = { /* costs for tuning for K6 */
 813   {
 814   /* Start of register allocator costs.  integer->integer move cost is 2. */
 815   3,                                 /* cost for loading QImode using movzbl */
 816   {4, 5, 4},                            /* cost of loading integer registers
 817                                            in QImode, HImode and SImode.
 818                                            Relative to reg-reg move (2).  */
 819   {2, 3, 2},                            /* cost of storing integer registers */
 820   4,                                    /* cost of reg,reg fld/fst */
 821   {6, 6, 6},                            /* cost of loading fp registers
 822                                            in SFmode, DFmode and XFmode */
 823   {4, 4, 4},                            /* cost of storing fp registers
 824                                            in SFmode, DFmode and XFmode */
 825   2,                                    /* cost of moving MMX register */
 826   {2, 2},                               /* cost of loading MMX registers
 827                                            in SImode and DImode */
 828   {2, 2},                               /* cost of storing MMX registers
 829                                            in SImode and DImode */
 830   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 831   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 832                                            in 32,64,128,256 and 512-bit */
 833   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 834                                            in 32,64,128,256 and 512-bit */
 835   6, 6,                         /* SSE->integer and integer->SSE moves */
 836   6, 6,                         /* mask->integer and integer->mask moves */
 837   {4, 5, 4},                            /* cost of loading mask register
 838                                            in QImode, HImode, SImode.  */
 839   {2, 3, 2},                            /* cost of storing mask register
 840                                            in QImode, HImode, SImode.  */
 841   2,                                    /* cost of moving mask register.  */
 842   /* End of register allocator costs.  */
 843   },
 844
 845   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 846   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 847   COSTS_N_INSNS (1),                    /* variable shift costs */
 848   COSTS_N_INSNS (1),                    /* constant shift costs */
 849   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 850    COSTS_N_INSNS (3),                   /*                               HI */
 851    COSTS_N_INSNS (3),                   /*                               SI */
 852    COSTS_N_INSNS (3),                   /*                               DI */
 853    COSTS_N_INSNS (3)},                  /*                            other */
 854   0,                                    /* cost of multiply per each bit set */
 855   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 856    COSTS_N_INSNS (18),                  /*                          HI */
 857    COSTS_N_INSNS (18),                  /*                          SI */
 858    COSTS_N_INSNS (18),                  /*                          DI */
 859    COSTS_N_INSNS (18)},                 /*                          other */
 860   COSTS_N_INSNS (2),                    /* cost of movsx */
 861   COSTS_N_INSNS (2),                    /* cost of movzx */
 862   8,                                    /* "large" insn */
 863   4,                                    /* MOVE_RATIO */
 864   4,                                    /* CLEAR_RATIO */
 865   {4, 5, 4},                            /* cost of loading integer registers
 866                                            in QImode, HImode and SImode.
 867                                            Relative to reg-reg move (2).  */
 868   {2, 3, 2},                            /* cost of storing integer registers */
 869   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 870                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 871   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 872                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 873   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 874   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 875   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 876   6,                                    /* cost of moving SSE register to integer.  */
 877   2, 2,                                 /* Gather load static, per_elt.  */
 878   2, 2,                                 /* Gather store static, per_elt.  */
 879   32,                                   /* size of l1 cache.  */
 880   32,                                   /* size of l2 cache.  Some models
 881                                            have integrated l2 cache, but
 882                                            optimizing for k6 is not important
 883                                            enough to worry about that.  */
 884   32,                                   /* size of prefetch block */
 885   1,                                    /* number of parallel prefetches */
 886   1,                                    /* Branch cost */
 887   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 888   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 889   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 890   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 891   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 892   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 893
 894   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 895   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 896   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 897   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 898   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 899   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 900   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 901   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 902   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 903   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 904   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 905   k6_memcpy,                            /* memcpy strategy table.  */
 906   k6_memset,                            /* memset strategy table.  */
 907   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 908   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 909   "32:8:8",                             /* Loop alignment.  */
 910   "32:8:8",                             /* Jump alignment.  */
 911   "0:0:8",                              /* Label alignment.  */
 912   "32",                                 /* Func alignment.  */
 913   4,                                    /* Small unroll limit.  */
 914   2,                                    /* Small unroll factor.  */
 915 };
 916
 917 /* For some reason, Athlon deals better with the REP prefix (relative to
 918    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
 919    and 128 bytes for memset.  */
 920 static stringop_algs athlon_memcpy[2] = {
       /* memcpy expansion strategy used by athlon_cost.  Each inner triple
          appears to read {largest block size, algorithm, flag}, with -1
          meaning "no upper bound": rep_prefix_4_byte up to 2048 bytes and
          libcall beyond that.  NOTE(review): confirm the field meanings
          against the stringop_algs declaration.  */
 921   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Second variant is the all-libcall placeholder; presumably it is
          the 64-bit entry, which is not tuned for Athlon -- TODO confirm.  */
 922   DUMMY_STRINGOP_ALGS};
 923 static stringop_algs athlon_memset[2] = {
       /* memset expansion strategy used by athlon_cost.  Each inner triple
          appears to read {largest block size, algorithm, flag}, with -1
          meaning "no upper bound": rep_prefix_4_byte up to 2048 bytes and
          libcall beyond that.  NOTE(review): confirm the field meanings
          against the stringop_algs declaration.  */
 924   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Second variant is the all-libcall placeholder; presumably it is
          the 64-bit entry, which is not tuned for Athlon -- TODO confirm.  */
 925   DUMMY_STRINGOP_ALGS};
 926 static const
     /* Cost table used when tuning for Athlon.  The entries are positional
        initializers for struct processor_costs, so their order is
        significant.  COSTS_N_INSNS values are relative to an add; the plain
        load/store numbers are relative to a reg-reg move (2).  */
 927 struct processor_costs athlon_cost = {
 928   {
 929   /* Start of register allocator costs.  integer->integer move cost is 2. */
 930   4,                                 /* cost for loading QImode using movzbl */
 931   {3, 4, 3},                            /* cost of loading integer registers
 932                                            in QImode, HImode and SImode.
 933                                            Relative to reg-reg move (2).  */
 934   {3, 4, 3},                            /* cost of storing integer registers */
 935   4,                                    /* cost of reg,reg fld/fst */
 936   {4, 4, 12},                           /* cost of loading fp registers
 937                                            in SFmode, DFmode and XFmode */
 938   {6, 6, 8},                            /* cost of storing fp registers
 939                                            in SFmode, DFmode and XFmode */
 940   2,                                    /* cost of moving MMX register */
 941   {4, 4},                               /* cost of loading MMX registers
 942                                            in SImode and DImode */
 943   {4, 4},                               /* cost of storing MMX registers
 944                                            in SImode and DImode */
 945   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 946   {4, 4, 12, 12, 24},                   /* cost of loading SSE registers
 947                                            in 32,64,128,256 and 512-bit */
 948   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 949                                            in 32,64,128,256 and 512-bit */
 950   5, 5,                         /* SSE->integer and integer->SSE moves */
 951   5, 5,                         /* mask->integer and integer->mask moves */
 952   {3, 4, 3},                            /* cost of loading mask register
 953                                            in QImode, HImode, SImode.  */
 954   {3, 4, 3},                            /* cost of storing mask register
 955                                            in QImode, HImode, SImode.  */
 956   2,                                    /* cost of moving mask register.  */
 957   /* End of register allocator costs.  */
 958   },
 959
 960   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 961   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 962   COSTS_N_INSNS (1),                    /* variable shift costs */
 963   COSTS_N_INSNS (1),                    /* constant shift costs */
 964   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 965    COSTS_N_INSNS (5),                   /*                               HI */
 966    COSTS_N_INSNS (5),                   /*                               SI */
 967    COSTS_N_INSNS (5),                   /*                               DI */
 968    COSTS_N_INSNS (5)},                  /*                            other */
 969   0,                                    /* cost of multiply per each bit set */
 970   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 971    COSTS_N_INSNS (26),                  /*                          HI */
 972    COSTS_N_INSNS (42),                  /*                          SI */
 973    COSTS_N_INSNS (74),                  /*                          DI */
 974    COSTS_N_INSNS (74)},                 /*                          other */
 975   COSTS_N_INSNS (1),                    /* cost of movsx */
 976   COSTS_N_INSNS (1),                    /* cost of movzx */
 977   8,                                    /* "large" insn */
 978   9,                                    /* MOVE_RATIO */
 979   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store costs that follow carry the same
          values as their register allocator counterparts above.  */
 980   {3, 4, 3},                            /* cost of loading integer registers
 981                                            in QImode, HImode and SImode.
 982                                            Relative to reg-reg move (2).  */
 983   {3, 4, 3},                            /* cost of storing integer registers */
 984   {4, 4, 12, 12, 24},                   /* cost of loading SSE register
 985                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 986   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
 987                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 988   {4, 4, 12, 12, 24},                   /* cost of unaligned loads.  */
 989   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
 990   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 991   5,                                    /* cost of moving SSE register to integer.  */
 992   4, 4,                                 /* Gather load static, per_elt.  */
 993   4, 4,                                 /* Gather store static, per_elt.  */
 994   64,                                   /* size of l1 cache.  */
 995   256,                                  /* size of l2 cache.  */
 996   64,                                   /* size of prefetch block */
 997   6,                                    /* number of parallel prefetches */
 998   5,                                    /* Branch cost */
 999   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1000   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1001   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
1002   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1003   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1004   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1005
1006   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1007   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1008   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1009   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1010   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1011   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1012   /* 11-16 (presumably the DIVSS latency range -- TODO confirm).  */
1013   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1014   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
1015   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1016   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
1017   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1018   athlon_memcpy,                        /* memcpy expansion strategy.  */
1019   athlon_memset,                        /* memset expansion strategy.  */
1020   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1021   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1022   "16:8:8",                             /* Loop alignment.  */
1023   "16:8:8",                             /* Jump alignment.  */
1024   "0:0:8",                              /* Label alignment.  */
1025   "16",                                 /* Func alignment.  */
1026   4,                                    /* Small unroll limit.  */
1027   2,                                    /* Small unroll factor.  */
1028 };
1029
1030 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1031    small blocks it is better to use a loop.  For large blocks, libcall can
1032    do non-temporal accesses and beat inline code considerably.  */
1033 static stringop_algs k8_memcpy[2] = {
       /* memcpy expansion strategy used by k8_cost.  Each inner triple
          appears to read {largest block size, algorithm, flag}, with -1
          meaning "no upper bound".  NOTE(review): confirm the field meanings
          against the stringop_algs declaration.
          First variant: loop up to 6 bytes, unrolled_loop up to 14,
          rep_prefix_4_byte for everything larger.  */
1034   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1035              {-1, rep_prefix_4_byte, false}}},
       /* Second variant (presumably the 64-bit one, given the 8-byte REP
          prefix -- TODO confirm): loop up to 16 bytes, rep_prefix_8_byte up
          to 8192, libcall beyond that.  */
1036   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1037              {-1, libcall, false}}}};
1038 static stringop_algs k8_memset[2] = {
       /* memset expansion strategy used by k8_cost.  Each inner triple
          appears to read {largest block size, algorithm, flag}, with -1
          meaning "no upper bound".  NOTE(review): confirm the field meanings
          against the stringop_algs declaration.
          First variant: loop up to 8 bytes, unrolled_loop up to 24,
          rep_prefix_4_byte up to 2048, libcall beyond that.  */
1039   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1040              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Second variant (presumably the 64-bit one, given the 8-byte REP
          prefix -- TODO confirm): unrolled_loop up to 48 bytes,
          rep_prefix_8_byte up to 8192, libcall beyond that.  */
1041   {libcall, {{48, unrolled_loop, false},
1042              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1043 static const
     /* Cost table used when tuning for K8.  The entries are positional
        initializers for struct processor_costs, so their order is
        significant.  COSTS_N_INSNS values are relative to an add; the plain
        load/store numbers are relative to a reg-reg move (2).  */
1044 struct processor_costs k8_cost = {
1045   {
1046   /* Start of register allocator costs.  integer->integer move cost is 2. */
1047   4,                                 /* cost for loading QImode using movzbl */
1048   {3, 4, 3},                            /* cost of loading integer registers
1049                                            in QImode, HImode and SImode.
1050                                            Relative to reg-reg move (2).  */
1051   {3, 4, 3},                            /* cost of storing integer registers */
1052   4,                                    /* cost of reg,reg fld/fst */
1053   {4, 4, 12},                           /* cost of loading fp registers
1054                                            in SFmode, DFmode and XFmode */
1055   {6, 6, 8},                            /* cost of storing fp registers
1056                                            in SFmode, DFmode and XFmode */
1057   2,                                    /* cost of moving MMX register */
1058   {3, 3},                               /* cost of loading MMX registers
1059                                            in SImode and DImode */
1060   {4, 4},                               /* cost of storing MMX registers
1061                                            in SImode and DImode */
1062   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1063   {4, 3, 12, 12, 24},                   /* cost of loading SSE registers
1064                                            in 32,64,128,256 and 512-bit */
1065   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
1066                                            in 32,64,128,256 and 512-bit */
1067   5, 5,                         /* SSE->integer and integer->SSE moves */
1068   5, 5,                         /* mask->integer and integer->mask moves */
1069   {3, 4, 3},                            /* cost of loading mask register
1070                                            in QImode, HImode, SImode.  */
1071   {3, 4, 3},                            /* cost of storing mask register
1072                                            in QImode, HImode, SImode.  */
1073   2,                                    /* cost of moving mask register.  */
1074   /* End of register allocator costs.  */
1075   },
1076
1077   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1078   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1079   COSTS_N_INSNS (1),                    /* variable shift costs */
1080   COSTS_N_INSNS (1),                    /* constant shift costs */
1081   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1082    COSTS_N_INSNS (4),                   /*                               HI */
1083    COSTS_N_INSNS (3),                   /*                               SI */
1084    COSTS_N_INSNS (4),                   /*                               DI */
1085    COSTS_N_INSNS (5)},                  /*                            other */
1086   0,                                    /* cost of multiply per each bit set */
1087   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1088    COSTS_N_INSNS (26),                  /*                          HI */
1089    COSTS_N_INSNS (42),                  /*                          SI */
1090    COSTS_N_INSNS (74),                  /*                          DI */
1091    COSTS_N_INSNS (74)},                 /*                          other */
1092   COSTS_N_INSNS (1),                    /* cost of movsx */
1093   COSTS_N_INSNS (1),                    /* cost of movzx */
1094   8,                                    /* "large" insn */
1095   9,                                    /* MOVE_RATIO */
1096   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store costs that follow carry the same
          values as their register allocator counterparts above.  */
1097   {3, 4, 3},                            /* cost of loading integer registers
1098                                            in QImode, HImode and SImode.
1099                                            Relative to reg-reg move (2).  */
1100   {3, 4, 3},                            /* cost of storing integer registers */
1101   {4, 3, 12, 12, 24},                   /* cost of loading SSE register
1102                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1103   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
1104                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1105   {4, 3, 12, 12, 24},                   /* cost of unaligned loads.  */
1106   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
1107   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1108   5,                                    /* cost of moving SSE register to integer.  */
1109   4, 4,                                 /* Gather load static, per_elt.  */
1110   4, 4,                                 /* Gather store static, per_elt.  */
1111   64,                                   /* size of l1 cache.  */
1112   512,                                  /* size of l2 cache.  */
1113   64,                                   /* size of prefetch block */
1114   /* New AMD processors never drop prefetches; if they cannot be performed
1115      immediately, they are queued.  We set number of simultaneous prefetches
1116      to a large constant to reflect this (it probably is not a good idea not
1117      to limit number of prefetches at all, as their execution also takes some
1118      time).  */
1119   100,                                  /* number of parallel prefetches */
1120   3,                                    /* Branch cost */
1121   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1122   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1123   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1124   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1125   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1126   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1127
1128   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1129   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1130   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1131   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1132   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1133   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1134   /* 11-16 (presumably the DIVSS latency range -- TODO confirm).  */
1135   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1136   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1137   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1138   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1139   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1140   k8_memcpy,                            /* memcpy expansion strategy.  */
1141   k8_memset,                            /* memset expansion strategy.  */
1142   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1143   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1144   "16:8:8",                             /* Loop alignment.  */
1145   "16:8:8",                             /* Jump alignment.  */
1146   "0:0:8",                              /* Label alignment.  */
1147   "16",                                 /* Func alignment.  */
1148   4,                                    /* Small unroll limit.  */
1149   2,                                    /* Small unroll factor.  */
1150 };
1151
1152 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1153    very small blocks it is better to use a loop.  For large blocks, libcall
1154    can do non-temporal accesses and beat inline code considerably.  */
1155 static stringop_algs amdfam10_memcpy[2] = {
       /* memcpy expansion strategy used by amdfam10_cost; the thresholds
          match k8_memcpy.  Each inner triple appears to read {largest block
          size, algorithm, flag}, with -1 meaning "no upper bound".
          NOTE(review): confirm the field meanings against the stringop_algs
          declaration.
          First variant: loop up to 6 bytes, unrolled_loop up to 14,
          rep_prefix_4_byte for everything larger.  */
1156   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1157              {-1, rep_prefix_4_byte, false}}},
       /* Second variant (presumably the 64-bit one, given the 8-byte REP
          prefix -- TODO confirm): loop up to 16 bytes, rep_prefix_8_byte up
          to 8192, libcall beyond that.  */
1158   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1159              {-1, libcall, false}}}};
1160 static stringop_algs amdfam10_memset[2] = {
       /* memset expansion strategy used by amdfam10_cost; the thresholds
          match k8_memset.  Each inner triple appears to read {largest block
          size, algorithm, flag}, with -1 meaning "no upper bound".
          NOTE(review): confirm the field meanings against the stringop_algs
          declaration.
          First variant: loop up to 8 bytes, unrolled_loop up to 24,
          rep_prefix_4_byte up to 2048, libcall beyond that.  */
1161   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1162              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Second variant (presumably the 64-bit one, given the 8-byte REP
          prefix -- TODO confirm): unrolled_loop up to 48 bytes,
          rep_prefix_8_byte up to 8192, libcall beyond that.  */
1163   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1164              {-1, libcall, false}}}};
1165 struct processor_costs amdfam10_cost = {
       /* Cost table used when tuning for AMDFAM10.  The entries are
          positional initializers for struct processor_costs, so their order
          is significant.  COSTS_N_INSNS values are relative to an add; the
          plain load/store numbers are relative to a reg-reg move (2).
          NOTE(review): unlike athlon_cost and k8_cost, this table is
          declared without static/const -- confirm that is intentional.  */
1166   {
1167   /* Start of register allocator costs.  integer->integer move cost is 2. */
1168   4,                                 /* cost for loading QImode using movzbl */
1169   {3, 4, 3},                            /* cost of loading integer registers
1170                                            in QImode, HImode and SImode.
1171                                            Relative to reg-reg move (2).  */
1172   {3, 4, 3},                            /* cost of storing integer registers */
1173   4,                                    /* cost of reg,reg fld/fst */
1174   {4, 4, 12},                           /* cost of loading fp registers
1175                                            in SFmode, DFmode and XFmode */
1176   {6, 6, 8},                            /* cost of storing fp registers
1177                                            in SFmode, DFmode and XFmode */
1178   2,                                    /* cost of moving MMX register */
1179   {3, 3},                               /* cost of loading MMX registers
1180                                            in SImode and DImode */
1181   {4, 4},                               /* cost of storing MMX registers
1182                                            in SImode and DImode */
1183   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1184   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
1185                                            in 32,64,128,256 and 512-bit */
1186   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
1187                                            in 32,64,128,256 and 512-bit */
1188   3, 3,                         /* SSE->integer and integer->SSE moves */
1189   3, 3,                         /* mask->integer and integer->mask moves */
1190   {3, 4, 3},                            /* cost of loading mask register
1191                                            in QImode, HImode, SImode.  */
1192   {3, 4, 3},                            /* cost of storing mask register
1193                                            in QImode, HImode, SImode.  */
1194   2,                                    /* cost of moving mask register.  */
1195
       /* NOTE(review): the timing notes below appear to justify the
          SSE<->integer move cost of 3 used above (FADD 3 on AMDFAM10)
          rather than the mask register cost they follow -- confirm.  */
1196                                         /* On K8:
1197                                             MOVD reg64, xmmreg Double FSTORE 4
1198                                             MOVD reg32, xmmreg Double FSTORE 4
1199                                            On AMDFAM10:
1200                                             MOVD reg64, xmmreg Double FADD 3
1201                                                                1/1  1/1
1202                                             MOVD reg32, xmmreg Double FADD 3
1203                                                                1/1  1/1 */
1204   /* End of register allocator costs.  */
1205   },
1206
1207   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1208   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1209   COSTS_N_INSNS (1),                    /* variable shift costs */
1210   COSTS_N_INSNS (1),                    /* constant shift costs */
1211   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1212    COSTS_N_INSNS (4),                   /*                               HI */
1213    COSTS_N_INSNS (3),                   /*                               SI */
1214    COSTS_N_INSNS (4),                   /*                               DI */
1215    COSTS_N_INSNS (5)},                  /*                            other */
1216   0,                                    /* cost of multiply per each bit set */
1217   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1218    COSTS_N_INSNS (35),                  /*                          HI */
1219    COSTS_N_INSNS (51),                  /*                          SI */
1220    COSTS_N_INSNS (83),                  /*                          DI */
1221    COSTS_N_INSNS (83)},                 /*                          other */
1222   COSTS_N_INSNS (1),                    /* cost of movsx */
1223   COSTS_N_INSNS (1),                    /* cost of movzx */
1224   8,                                    /* "large" insn */
1225   9,                                    /* MOVE_RATIO */
1226   6,                                    /* CLEAR_RATIO */
       /* The integer and aligned SSE load/store costs that follow carry the
          same values as their register allocator counterparts above.  */
1227   {3, 4, 3},                            /* cost of loading integer registers
1228                                            in QImode, HImode and SImode.
1229                                            Relative to reg-reg move (2).  */
1230   {3, 4, 3},                            /* cost of storing integer registers */
1231   {4, 4, 3, 6, 12},                     /* cost of loading SSE register
1232                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1233   {4, 4, 5, 10, 20},                    /* cost of storing SSE register
1234                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1235   {4, 4, 3, 7, 12},                     /* cost of unaligned loads; the 256bit
1236                                            entry is 7 versus 6 aligned.  */
1237   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
1238   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1239   3,                                    /* cost of moving SSE register to integer.  */
1240   4, 4,                                 /* Gather load static, per_elt.  */
1241   4, 4,                                 /* Gather store static, per_elt.  */
1242   64,                                   /* size of l1 cache.  */
1243   512,                                  /* size of l2 cache.  */
1244   64,                                   /* size of prefetch block */
1245   /* New AMD processors never drop prefetches; if they cannot be performed
1246      immediately, they are queued.  We set number of simultaneous prefetches
1247      to a large constant to reflect this (it probably is not a good idea not
1248      to limit number of prefetches at all, as their execution also takes some
1249      time).  */
1250   100,                                  /* number of parallel prefetches */
1251   2,                                    /* Branch cost */
1252   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1253   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1254   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1255   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1256   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1257   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1258
1259   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1260   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1261   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1262   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1263   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1264   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1265   /* 11-16 (presumably the DIVSS latency range -- TODO confirm).  */
1266   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1267   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1268   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1269   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1270   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1271   amdfam10_memcpy,                      /* memcpy expansion strategy.  */
1272   amdfam10_memset,                      /* memset expansion strategy.  */
1273   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1274   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1275   "32:25:8",                            /* Loop alignment.  */
1276   "32:8:8",                             /* Jump alignment.  */
1277   "0:0:8",                              /* Label alignment.  */
1278   "32",                                 /* Func alignment.  */
1279   4,                                    /* Small unroll limit.  */
1280   2,                                    /* Small unroll factor.  */
1281 };
1281
1282 /* BDVER has an optimized REP instruction for medium-sized blocks, but for
1283    very small blocks it is better to use a loop.  For large blocks, libcall
1284    can do non-temporal accesses and beat inline code considerably.  */
1285 static stringop_algs bdver_memcpy[2] = {
       /* memcpy expansion strategy used by bdver_cost; the thresholds match
          k8_memcpy.  Each inner triple appears to read {largest block size,
          algorithm, flag}, with -1 meaning "no upper bound".
          NOTE(review): confirm the field meanings against the stringop_algs
          declaration.
          First variant: loop up to 6 bytes, unrolled_loop up to 14,
          rep_prefix_4_byte for everything larger.  */
1286   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1287              {-1, rep_prefix_4_byte, false}}},
       /* Second variant (presumably the 64-bit one, given the 8-byte REP
          prefix -- TODO confirm): loop up to 16 bytes, rep_prefix_8_byte up
          to 8192, libcall beyond that.  */
1288   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1289              {-1, libcall, false}}}};
1290 static stringop_algs bdver_memset[2] = {
       /* memset expansion strategy used by bdver_cost; the thresholds match
          k8_memset.  Each inner triple appears to read {largest block size,
          algorithm, flag}, with -1 meaning "no upper bound".
          NOTE(review): confirm the field meanings against the stringop_algs
          declaration.
          First variant: loop up to 8 bytes, unrolled_loop up to 24,
          rep_prefix_4_byte up to 2048, libcall beyond that.  */
1291   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1292              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Second variant (presumably the 64-bit one, given the 8-byte REP
          prefix -- TODO confirm): unrolled_loop up to 48 bytes,
          rep_prefix_8_byte up to 8192, libcall beyond that.  */
1293   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1294              {-1, libcall, false}}}};
1295
1296 const struct processor_costs bdver_cost = {
1297   {
1298   /* Start of register allocator costs.  integer->integer move cost is 2. */
1299   8,                                 /* cost for loading QImode using movzbl */
1300   {8, 8, 8},                            /* cost of loading integer registers
1301                                            in QImode, HImode and SImode.
1302                                            Relative to reg-reg move (2).  */
1303   {8, 8, 8},                            /* cost of storing integer registers */
1304   4,                                    /* cost of reg,reg fld/fst */
1305   {12, 12, 28},                         /* cost of loading fp registers
1306                                            in SFmode, DFmode and XFmode */
1307   {10, 10, 18},                         /* cost of storing fp registers
1308                                            in SFmode, DFmode and XFmode */
1309   4,                                    /* cost of moving MMX register */
1310   {12, 12},                             /* cost of loading MMX registers
1311                                            in SImode and DImode */
1312   {10, 10},                             /* cost of storing MMX registers
1313                                            in SImode and DImode */
1314   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1315   {12, 12, 10, 40, 60},                 /* cost of loading SSE registers
1316                                            in 32,64,128,256 and 512-bit */
1317   {10, 10, 10, 40, 60},                 /* cost of storing SSE registers
1318                                            in 32,64,128,256 and 512-bit */
1319   16, 20,                               /* SSE->integer and integer->SSE moves */
1320   16, 20,                               /* mask->integer and integer->mask moves */
1321   {8, 8, 8},                            /* cost of loading mask register
1322                                            in QImode, HImode, SImode.  */
1323   {8, 8, 8},                            /* cost if storing mask register
1324                                            in QImode, HImode, SImode.  */
1325   2,                                    /* cost of moving mask register.  */
1326   /* End of register allocator costs.  */
1327   },
1328
1329   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1330   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1331   COSTS_N_INSNS (1),                    /* variable shift costs */
1332   COSTS_N_INSNS (1),                    /* constant shift costs */
1333   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1334    COSTS_N_INSNS (4),                   /*                               HI */
1335    COSTS_N_INSNS (4),                   /*                               SI */
1336    COSTS_N_INSNS (6),                   /*                               DI */
1337    COSTS_N_INSNS (6)},                  /*                            other */
1338   0,                                    /* cost of multiply per each bit set */
1339   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1340    COSTS_N_INSNS (35),                  /*                          HI */
1341    COSTS_N_INSNS (51),                  /*                          SI */
1342    COSTS_N_INSNS (83),                  /*                          DI */
1343    COSTS_N_INSNS (83)},                 /*                          other */
1344   COSTS_N_INSNS (1),                    /* cost of movsx */
1345   COSTS_N_INSNS (1),                    /* cost of movzx */
1346   8,                                    /* "large" insn */
1347   9,                                    /* MOVE_RATIO */
1348   6,                                    /* CLEAR_RATIO */
1349   {8, 8, 8},                            /* cost of loading integer registers
1350                                            in QImode, HImode and SImode.
1351                                            Relative to reg-reg move (2).  */
1352   {8, 8, 8},                            /* cost of storing integer registers */
1353   {12, 12, 10, 40, 60},                 /* cost of loading SSE register
1354                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1355   {10, 10, 10, 40, 60},                 /* cost of storing SSE register
1356                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1357   {12, 12, 10, 40, 60},                 /* cost of unaligned loads.  */
1358   {10, 10, 10, 40, 60},                 /* cost of unaligned stores.  */
1359   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1360   16,                                   /* cost of moving SSE register to integer.  */
1361   12, 12,                               /* Gather load static, per_elt.  */
1362   10, 10,                               /* Gather store static, per_elt.  */
1363   16,                                   /* size of l1 cache.  */
1364   2048,                                 /* size of l2 cache.  */
1365   64,                                   /* size of prefetch block */
1366   /* New AMD processors never drop prefetches; if they cannot be performed
1367      immediately, they are queued.  We set number of simultaneous prefetches
1368      to a large constant to reflect this (it probably is not a good idea not
1369      to limit number of prefetches at all, as their execution also takes some
1370      time).  */
1371   100,                                  /* number of parallel prefetches */
1372   2,                                    /* Branch cost */
1373   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1374   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1375   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1376   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1377   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1378   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1379
1380   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1381   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1382   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1383   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1384   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1385   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1386   /* 9-24  */
1387   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1388   /* 9-27  */
1389   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1390   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1391   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1392   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1393   bdver_memcpy,
1394   bdver_memset,
1395   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1396   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1397   "16:11:8",                            /* Loop alignment.  */
1398   "16:8:8",                             /* Jump alignment.  */
1399   "0:0:8",                              /* Label alignment.  */
1400   "11",                                 /* Func alignment.  */
1401   4,                                    /* Small unroll limit.  */
1402   2,                                    /* Small unroll factor.  */
1403 };
1404
1405
1406 /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1407     very small blocks it is better to use a loop.  For large blocks, libcall
1408     can do non-temporal accesses and beat inline considerably.  */
1409 static stringop_algs znver1_memcpy[2] = {
1410   /* 32-bit tuning; no rep_prefix_* algorithm is listed for this mode.  */
1411   {libcall, {{6, loop, false},           /* presumably {max size, algorithm, noalign} -- TODO confirm.  */
1412              {14, unrolled_loop, false},
1413              {-1, libcall, false}}},
1414   /* 64-bit tuning; rep_prefix_8_byte sits between the loop and libcall entries.  */
1415   {libcall, {{16, loop, false},
1416              {128, rep_prefix_8_byte, false},
1417              {-1, libcall, false}}}};
1418 static stringop_algs znver1_memset[2] = {  /* memset algorithm table referenced by znver1_cost.  */
1419   /* 32-bit tuning; rep_prefix_4_byte is listed before the final libcall.  */
1420   {libcall, {{8, loop, false},
1421              {24, unrolled_loop, false},
1422              {128, rep_prefix_4_byte, false},
1423              {-1, libcall, false}}},
1424   /* 64-bit tuning; starts with unrolled_loop, there is no plain loop entry.  */
1425   {libcall, {{48, unrolled_loop, false},
1426              {128, rep_prefix_8_byte, false},
1427              {-1, libcall, false}}}};
1428 struct processor_costs znver1_cost = {  /* costs for tuning for znver1 */
1429   {
1430   /* Start of register allocator costs.  integer->integer move cost is 2. */
1431
1432   /* reg-reg moves are done by renaming and thus they are even cheaper than
1433      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1434      to doubles of latencies, we do not model this correctly.  It does not
1435      seem to make practical difference to bump prices up even more.  */
1436   6,                                    /* cost for loading QImode using
1437                                            movzbl.  */
1438   {6, 6, 6},                            /* cost of loading integer registers
1439                                            in QImode, HImode and SImode.
1440                                            Relative to reg-reg move (2).  */
1441   {8, 8, 8},                            /* cost of storing integer
1442                                            registers.  */
1443   2,                                    /* cost of reg,reg fld/fst.  */
1444   {6, 6, 16},                           /* cost of loading fp registers
1445                                            in SFmode, DFmode and XFmode.  */
1446   {8, 8, 16},                           /* cost of storing fp registers
1447                                            in SFmode, DFmode and XFmode.  */
1448   2,                                    /* cost of moving MMX register.  */
1449   {6, 6},                               /* cost of loading MMX registers
1450                                            in SImode and DImode.  */
1451   {8, 8},                               /* cost of storing MMX registers
1452                                            in SImode and DImode.  */
1453   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1454   {6, 6, 6, 12, 24},                    /* cost of loading SSE registers
1455                                            in 32,64,128,256 and 512-bit.  */
1456   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1457                                            in 32,64,128,256 and 512-bit.  */
1458   6, 6,                         /* SSE->integer and integer->SSE moves.  */
1459   8, 8,                         /* mask->integer and integer->mask moves */
1460   {6, 6, 6},                            /* cost of loading mask register
1461                                            in QImode, HImode, SImode.  */
1462   {8, 8, 8},                            /* cost of storing mask register
1463                                            in QImode, HImode, SImode.  */
1464   2,                                    /* cost of moving mask register.  */
1465   /* End of register allocator costs.  */
1466   },
1467
1468   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1469   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1470   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1471   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1472   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1473    COSTS_N_INSNS (3),                   /*                               HI.  */
1474    COSTS_N_INSNS (3),                   /*                               SI.  */
1475    COSTS_N_INSNS (3),                   /*                               DI.  */
1476    COSTS_N_INSNS (3)},                  /*                            other.  */
1477   0,                                    /* cost of multiply per each bit
1478                                             set.  */
1479    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1480       bound.  */
1481   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1482    COSTS_N_INSNS (22),                  /*                          HI.  */
1483    COSTS_N_INSNS (30),                  /*                          SI.  */
1484    COSTS_N_INSNS (45),                  /*                          DI.  */
1485    COSTS_N_INSNS (45)},                 /*                          other.  */
1486   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1487   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1488   8,                                    /* "large" insn.  */
1489   9,                                    /* MOVE_RATIO.  */
1490   6,                                    /* CLEAR_RATIO */
1491   {6, 6, 6},                            /* cost of loading integer registers
1492                                            in QImode, HImode and SImode.
1493                                            Relative to reg-reg move (2).  */
1494   {8, 8, 8},                            /* cost of storing integer
1495                                            registers.  */
1496   {6, 6, 6, 12, 24},                    /* cost of loading SSE register
1497                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1498   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
1499                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1500   {6, 6, 6, 12, 24},                    /* cost of unaligned loads.  */
1501   {8, 8, 8, 16, 32},                    /* cost of unaligned stores.  */
1502   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1503   6,                                    /* cost of moving SSE register to integer.  */
1504   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS (presumably; the
1505      text named VGATHERDPD twice -- confirm) is 35 uops, throughput 12.  Approx
1506      9 uops do not depend on vector size and every load is 7 uops.  */
1507   18, 8,                                /* Gather load static, per_elt.  */
1508   18, 10,                               /* Gather store static, per_elt.  */
1509   32,                                   /* size of l1 cache.  */
1510   512,                                  /* size of l2 cache.  */
1511   64,                                   /* size of prefetch block.  */
1512   /* New AMD processors never drop prefetches; if they cannot be performed
1513      immediately, they are queued.  We set number of simultaneous prefetches
1514      to a large constant to reflect this (it probably is not a good idea not
1515      to limit number of prefetches at all, as their execution also takes some
1516      time).  */
1517   100,                                  /* number of parallel prefetches.  */
1518   3,                                    /* Branch cost.  */
1519   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1520   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1521   /* Latency of fdiv is 8-15.  */
1522   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1523   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1524   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1525   /* Latency of fsqrt is 4-10.  */
1526   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1527
1528   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1529   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1530   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1531   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1532   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1533   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1534   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1535   /* 9-13  */
1536   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1537   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1538   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1539   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1540      and it can execute 2 integer additions and 2 multiplications thus
1541      reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
1542      that 4 works better than 6, probably due to register pressure.
1543
1544      Integer vector operations are taken by FP unit and execute 3 vector
1545      plus/minus operations per cycle but only one multiply.  This is adjusted
1546      in ix86_reassociation_width.  */
1547   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1548   znver1_memcpy,
1549   znver1_memset,
1550   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1551   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1552   "16",                                 /* Loop alignment.  */
1553   "16",                                 /* Jump alignment.  */
1554   "0:0:8",                              /* Label alignment.  */
1555   "16",                                 /* Func alignment.  */
1556   4,                                    /* Small unroll limit.  */
1557   2,                                    /* Small unroll factor.  */
1558 };
1559
1560 /*  ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
1561     very small blocks it is better to use a loop.  For large blocks, libcall
1562     can do non-temporal accesses and beat inline considerably.  */
1563 static stringop_algs znver2_memcpy[2] = {
1564   /* 32-bit tuning; no rep_prefix_* algorithm is listed for this mode.  */
1565   {libcall, {{6, loop, false},           /* presumably {max size, algorithm, noalign} -- TODO confirm.  */
1566              {14, unrolled_loop, false},
1567              {-1, libcall, false}}},
1568   /* 64-bit tuning; rep_prefix_4_byte sits between the loop and libcall entries.  */
1569   {libcall, {{16, loop, false},
1570              {64, rep_prefix_4_byte, false},
1571              {-1, libcall, false}}}};
1572 static stringop_algs znver2_memset[2] = {  /* memset algorithm table referenced by znver2_cost and znver3_cost.  */
1573   /* 32-bit tuning; rep_prefix_4_byte is listed before the final libcall.  */
1574   {libcall, {{8, loop, false},
1575              {24, unrolled_loop, false},
1576              {128, rep_prefix_4_byte, false},
1577              {-1, libcall, false}}},
1578   /* 64-bit tuning; only rep_prefix_* entries precede the final libcall.  */
1579   {libcall, {{24, rep_prefix_4_byte, false},
1580              {128, rep_prefix_8_byte, false},
1581              {-1, libcall, false}}}};
1582
1583 struct processor_costs znver2_cost = {  /* costs for tuning for znver2 */
1584   {
1585   /* Start of register allocator costs.  integer->integer move cost is 2. */
1586
1587   /* reg-reg moves are done by renaming and thus they are even cheaper than
1588      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1589      to doubles of latencies, we do not model this correctly.  It does not
1590      seem to make practical difference to bump prices up even more.  */
1591   6,                                    /* cost for loading QImode using
1592                                            movzbl.  */
1593   {6, 6, 6},                            /* cost of loading integer registers
1594                                            in QImode, HImode and SImode.
1595                                            Relative to reg-reg move (2).  */
1596   {8, 8, 8},                            /* cost of storing integer
1597                                            registers.  */
1598   2,                                    /* cost of reg,reg fld/fst.  */
1599   {6, 6, 16},                           /* cost of loading fp registers
1600                                            in SFmode, DFmode and XFmode.  */
1601   {8, 8, 16},                           /* cost of storing fp registers
1602                                            in SFmode, DFmode and XFmode.  */
1603   2,                                    /* cost of moving MMX register.  */
1604   {6, 6},                               /* cost of loading MMX registers
1605                                            in SImode and DImode.  */
1606   {8, 8},                               /* cost of storing MMX registers
1607                                            in SImode and DImode.  */
1608   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1609                                            register.  */
1610   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1611                                            in 32,64,128,256 and 512-bit.  */
1612   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1613                                            in 32,64,128,256 and 512-bit.  */
1614   6, 6,                                 /* SSE->integer and integer->SSE
1615                                            moves.  */
1616   8, 8,                         /* mask->integer and integer->mask moves */
1617   {6, 6, 6},                            /* cost of loading mask register
1618                                            in QImode, HImode, SImode.  */
1619   {8, 8, 8},                            /* cost of storing mask register
1620                                            in QImode, HImode, SImode.  */
1621   2,                                    /* cost of moving mask register.  */
1622   /* End of register allocator costs.  */
1623   },
1624
1625   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1626   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1627   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1628   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1629   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1630    COSTS_N_INSNS (3),                   /*                               HI.  */
1631    COSTS_N_INSNS (3),                   /*                               SI.  */
1632    COSTS_N_INSNS (3),                   /*                               DI.  */
1633    COSTS_N_INSNS (3)},                  /*                      other.  */
1634   0,                                    /* cost of multiply per each bit
1635                                            set.  */
1636    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1637       bound.  */
1638   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1639    COSTS_N_INSNS (22),                  /*                          HI.  */
1640    COSTS_N_INSNS (30),                  /*                          SI.  */
1641    COSTS_N_INSNS (45),                  /*                          DI.  */
1642    COSTS_N_INSNS (45)},                 /*                          other.  */
1643   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1644   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1645   8,                                    /* "large" insn.  */
1646   9,                                    /* MOVE_RATIO.  */
1647   6,                                    /* CLEAR_RATIO */
1648   {6, 6, 6},                            /* cost of loading integer registers
1649                                            in QImode, HImode and SImode.
1650                                            Relative to reg-reg move (2).  */
1651   {8, 8, 8},                            /* cost of storing integer
1652                                            registers.  */
1653   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1654                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1655   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1656                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1657   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1658   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1659   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1660                                            register.  */
1661   6,                                    /* cost of moving SSE register to integer.  */
1662   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS (presumably; the
1663      text named VGATHERDPD twice -- confirm) is 35 uops, throughput 12.  Approx
1664      9 uops do not depend on vector size and every load is 7 uops.  */
1665   18, 8,                                /* Gather load static, per_elt.  */
1666   18, 10,                               /* Gather store static, per_elt.  */
1667   32,                                   /* size of l1 cache.  */
1668   512,                                  /* size of l2 cache.  */
1669   64,                                   /* size of prefetch block.  */
1670   /* New AMD processors never drop prefetches; if they cannot be performed
1671      immediately, they are queued.  We set number of simultaneous prefetches
1672      to a large constant to reflect this (it probably is not a good idea not
1673      to limit number of prefetches at all, as their execution also takes some
1674      time).  */
1675   100,                                  /* number of parallel prefetches.  */
1676   3,                                    /* Branch cost.  */
1677   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1678   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1679   /* Latency of fdiv is 8-15.  */
1680   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1681   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1682   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1683   /* Latency of fsqrt is 4-10.  */
1684   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1685
1686   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1687   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1688   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1689   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1690   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1691   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1692   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1693   /* 9-13.  */
1694   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1695   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1696   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1697   /* Zen can execute 4 integer operations per cycle.  FP operations
1698      take 3 cycles and it can execute 2 integer additions and 2
1699      multiplications thus reassociation may make sense up to width of 6.
1700      SPEC2k6 benchmarks suggest that 4 works better than 6, probably due to
1701      register pressure.
1702
1703      Integer vector operations are taken by FP unit and execute 3 vector
1704      plus/minus operations per cycle but only one multiply.  This is adjusted
1705      in ix86_reassociation_width.  */
1706   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1707   znver2_memcpy,
1708   znver2_memset,
1709   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1710   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1711   "16",                                 /* Loop alignment.  */
1712   "16",                                 /* Jump alignment.  */
1713   "0:0:8",                              /* Label alignment.  */
1714   "16",                                 /* Func alignment.  */
1715   4,                                    /* Small unroll limit.  */
1716   2,                                    /* Small unroll factor.  */
1717 };
1718
1719 struct processor_costs znver3_cost = {  /* costs for tuning for znver3 */
1720   {
1721   /* Start of register allocator costs.  integer->integer move cost is 2. */
1722
1723   /* reg-reg moves are done by renaming and thus they are even cheaper than
1724      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1725      to doubles of latencies, we do not model this correctly.  It does not
1726      seem to make practical difference to bump prices up even more.  */
1727   6,                                    /* cost for loading QImode using
1728                                            movzbl.  */
1729   {6, 6, 6},                            /* cost of loading integer registers
1730                                            in QImode, HImode and SImode.
1731                                            Relative to reg-reg move (2).  */
1732   {8, 8, 8},                            /* cost of storing integer
1733                                            registers.  */
1734   2,                                    /* cost of reg,reg fld/fst.  */
1735   {6, 6, 16},                           /* cost of loading fp registers
1736                                            in SFmode, DFmode and XFmode.  */
1737   {8, 8, 16},                           /* cost of storing fp registers
1738                                            in SFmode, DFmode and XFmode.  */
1739   2,                                    /* cost of moving MMX register.  */
1740   {6, 6},                               /* cost of loading MMX registers
1741                                            in SImode and DImode.  */
1742   {8, 8},                               /* cost of storing MMX registers
1743                                            in SImode and DImode.  */
1744   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1745                                            register.  */
1746   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1747                                            in 32,64,128,256 and 512-bit.  */
1748   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1749                                            in 32,64,128,256 and 512-bit.  */
1750   6, 6,                                 /* SSE->integer and integer->SSE
1751                                            moves.  */
1752   8, 8,                         /* mask->integer and integer->mask moves */
1753   {6, 6, 6},                            /* cost of loading mask register
1754                                            in QImode, HImode, SImode.  */
1755   {8, 8, 8},                            /* cost of storing mask register
1756                                            in QImode, HImode, SImode.  */
1757   2,                                    /* cost of moving mask register.  */
1758   /* End of register allocator costs.  */
1759   },
1760
1761   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1762   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1763   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1764   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1765   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1766    COSTS_N_INSNS (3),                   /*                               HI.  */
1767    COSTS_N_INSNS (3),                   /*                               SI.  */
1768    COSTS_N_INSNS (3),                   /*                               DI.  */
1769    COSTS_N_INSNS (3)},                  /*                      other.  */
1770   0,                                    /* cost of multiply per each bit
1771                                            set.  */
1772   {COSTS_N_INSNS (9),                   /* cost of a divide/mod for QI.  */
1773    COSTS_N_INSNS (10),                  /*                          HI.  */
1774    COSTS_N_INSNS (12),                  /*                          SI.  */
1775    COSTS_N_INSNS (17),                  /*                          DI.  */
1776    COSTS_N_INSNS (17)},                 /*                          other.  */
1777   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1778   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1779   8,                                    /* "large" insn.  */
1780   9,                                    /* MOVE_RATIO.  */
1781   6,                                    /* CLEAR_RATIO */
1782   {6, 6, 6},                            /* cost of loading integer registers
1783                                            in QImode, HImode and SImode.
1784                                            Relative to reg-reg move (2).  */
1785   {8, 8, 8},                            /* cost of storing integer
1786                                            registers.  */
1787   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1788                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1789   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1790                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1791   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1792   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1793   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1794                                            register.  */
1795   6,                                    /* cost of moving SSE register to integer.  */
1796   /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1797      throughput 9.  Approx 7 uops do not depend on vector size and every load
1798      is 4 uops.  */
1799   14, 8,                                /* Gather load static, per_elt.  */
1800   14, 10,                               /* Gather store static, per_elt.  */
1801   32,                                   /* size of l1 cache.  */
1802   512,                                  /* size of l2 cache.  */
1803   64,                                   /* size of prefetch block.  */
1804   /* New AMD processors never drop prefetches; if they cannot be performed
1805      immediately, they are queued.  We set number of simultaneous prefetches
1806      to a large constant to reflect this (it probably is not a good idea not
1807      to limit number of prefetches at all, as their execution also takes some
1808      time).  */
1809   100,                                  /* number of parallel prefetches.  */
1810   3,                                    /* Branch cost.  */
1811   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1812   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1813   /* Latency of fdiv is 8-15.  */
1814   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1815   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1816   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1817   /* Latency of fsqrt is 4-10.  */
1818   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1819
1820   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1821   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1822   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1823   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1824   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1825   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1826   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1827   /* 9-13.  */
1828   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1829   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1830   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1831   /* Zen can execute 4 integer operations per cycle.  FP operations
1832      take 3 cycles and it can execute 2 integer additions and 2
1833      multiplications thus reassociation may make sense up to width of 6.
1834      SPEC2k6 benchmarks suggest that 4 works better than 6, probably due to
1835      register pressure.
1836
1837      Integer vector operations are taken by FP unit and execute 3 vector
1838      plus/minus operations per cycle but only one multiply.  This is adjusted
1839      in ix86_reassociation_width.  */
1840   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1841   znver2_memcpy,                        /* the znver2 table is reused here.  */
1842   znver2_memset,                        /* the znver2 table is reused here.  */
1843   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1844   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1845   "16",                                 /* Loop alignment.  */
1846   "16",                                 /* Jump alignment.  */
1847   "0:0:8",                              /* Label alignment.  */
1848   "16",                                 /* Func alignment.  */
1849   4,                                    /* Small unroll limit.  */
1850   2,                                    /* Small unroll factor.  */
1851 };
1852
1853 /* This table currently replicates znver3_cost table. */
1854 struct processor_costs znver4_cost = {
1855   {
1856   /* Start of register allocator costs.  integer->integer move cost is 2. */
1857
1858   /* reg-reg moves are done by renaming and thus they are even cheaper than
1859      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1860      to doubles of latencies, we do not model this correctly.  It does not
1861      seem to make practical difference to bump prices up even more.  */
1862   6,                                    /* cost for loading QImode using
1863                                            movzbl.  */
1864   {6, 6, 6},                            /* cost of loading integer registers
1865                                            in QImode, HImode and SImode.
1866                                            Relative to reg-reg move (2).  */
1867   {8, 8, 8},                            /* cost of storing integer
1868                                            registers.  */
1869   2,                                    /* cost of reg,reg fld/fst.  */
1870   {14, 14, 17},                         /* cost of loading fp registers
1871                                            in SFmode, DFmode and XFmode.  */
1872   {12, 12, 16},                         /* cost of storing fp registers
1873                                            in SFmode, DFmode and XFmode.  */
1874   2,                                    /* cost of moving MMX register.  */
1875   {6, 6},                               /* cost of loading MMX registers
1876                                            in SImode and DImode.  */
1877   {8, 8},                               /* cost of storing MMX registers
1878                                            in SImode and DImode.  */
1879   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1880                                            register.  */
1881   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
1882                                            in 32,64,128,256 and 512-bit.  */
1883   {8, 8, 8, 12, 12},                    /* cost of storing SSE registers
1884                                            in 32,64,128,256 and 512-bit.  */
1885   6, 8,                                 /* SSE->integer and integer->SSE
1886                                            moves.  */
1887   8, 8,                                 /* mask->integer and integer->mask moves */
1888   {6, 6, 6},                            /* cost of loading mask register
1889                                            in QImode, HImode, SImode.  */
1890   {8, 8, 8},                            /* cost of storing mask register
1891                                            in QImode, HImode, SImode.  */
1892   2,                                    /* cost of moving mask register.  */
1893   /* End of register allocator costs.  */
1894   },
1895
1896   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1897   /* TODO: Lea with 3 components has cost 2.  */
1898   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1899   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1900   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1901   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1902    COSTS_N_INSNS (3),                   /*                               HI.  */
1903    COSTS_N_INSNS (3),                   /*                               SI.  */
1904    COSTS_N_INSNS (3),                   /*                               DI.  */
1905    COSTS_N_INSNS (3)},                  /*                      other.  */
1906   0,                                    /* cost of multiply per each bit
1907                                            set.  */
1908   {COSTS_N_INSNS (12),                  /* cost of a divide/mod for QI.  */
1909    COSTS_N_INSNS (13),                  /*                          HI.  */
1910    COSTS_N_INSNS (13),                  /*                          SI.  */
1911    COSTS_N_INSNS (18),                  /*                          DI.  */
1912    COSTS_N_INSNS (18)},                 /*                          other.  */
1913   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1914   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1915   8,                                    /* "large" insn.  */
1916   9,                                    /* MOVE_RATIO.  */
1917   6,                                    /* CLEAR_RATIO */
1918   {6, 6, 6},                            /* cost of loading integer registers
1919                                            in QImode, HImode and SImode.
1920                                            Relative to reg-reg move (2).  */
1921   {8, 8, 8},                            /* cost of storing integer
1922                                            registers.  */
1923   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
1924                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1925   {8, 8, 8, 12, 12},                    /* cost of storing SSE register
1926                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1927   {6, 6, 10, 10, 12},                   /* cost of unaligned loads.  */
1928   {8, 8, 8, 12, 12},                    /* cost of unaligned stores.  */
1929   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM
1930                                            register.  */
1931   6,                                    /* cost of moving SSE register to integer.  */
1932   /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1933      throughput 5.  Approx 7 uops do not depend on vector size and every load
1934      is 5 uops.  */
1935   14, 10,                               /* Gather load static, per_elt.  */
1936   14, 20,                               /* Gather store static, per_elt.  */
1937   32,                                   /* size of l1 cache.  */
1938   1024,                                 /* size of l2 cache.  */
1939   64,                                   /* size of prefetch block.  */
1940   /* New AMD processors never drop prefetches; if they cannot be performed
1941      immediately, they are queued.  We set number of simultaneous prefetches
1942      to a large constant to reflect this (it probably is not a good idea not
1943      to limit number of prefetches at all, as their execution also takes some
1944      time).  */
1945   100,                                  /* number of parallel prefetches.  */
1946   3,                                    /* Branch cost.  */
1947   COSTS_N_INSNS (7),                    /* cost of FADD and FSUB insns.  */
1948   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1949   /* Latency of fdiv is 8-15.  */
1950   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1951   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1952   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1953   /* Latency of fsqrt is 4-10.  NOTE(review): the cost below is 25 - confirm.  */
1954   COSTS_N_INSNS (25),                   /* cost of FSQRT instruction.  */
1955
1956   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1957   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1958   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1959   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1960   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
1961   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
1962   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1963   /* 9-13.  */
1964   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1965   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1966   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1967   /* Zen can execute 4 integer operations per cycle.  FP operations
1968      take 3 cycles and it can execute 2 integer additions and 2
1969      multiplications thus reassociation may make sense up to width of 6.
1970      SPEC2k6 benchmarks suggest
1971      that 4 works better than 6 probably due to register pressure.
1972
1973      Integer vector operations are taken by FP unit and execute 3 vector
1974      plus/minus operations per cycle but only one multiply.  This is adjusted
1975      in ix86_reassociation_width.  */
1976   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1977   znver2_memcpy,
1978   znver2_memset,
1979   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1980   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1981   "16",                                 /* Loop alignment.  */
1982   "16",                                 /* Jump alignment.  */
1983   "0:0:8",                              /* Label alignment.  */
1984   "16",                                 /* Func alignment.  */
1985   4,                                    /* Small unroll limit.  */
1986   2,                                    /* Small unroll factor.  */
1987 };
1988
1989 /* Based on znver4_cost; entries retuned for Zen5 are commented below.  */
1990 struct processor_costs znver5_cost = {
1991   {
1992   /* Start of register allocator costs.  integer->integer move cost is 2. */
1993
1994   /* reg-reg moves are done by renaming and thus they are even cheaper than
1995      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1996      to doubles of latencies, we do not model this correctly.  It does not
1997      seem to make practical difference to bump prices up even more.  */
1998   6,                                    /* cost for loading QImode using
1999                                            movzbl.  */
2000   {6, 6, 6},                            /* cost of loading integer registers
2001                                            in QImode, HImode and SImode.
2002                                            Relative to reg-reg move (2).  */
2003   {8, 8, 8},                            /* cost of storing integer
2004                                            registers.  */
2005   2,                                    /* cost of reg,reg fld/fst.  */
2006   {14, 14, 17},                         /* cost of loading fp registers
2007                                            in SFmode, DFmode and XFmode.  */
2008   {12, 12, 16},                         /* cost of storing fp registers
2009                                            in SFmode, DFmode and XFmode.  */
2010   2,                                    /* cost of moving MMX register.  */
2011   {6, 6},                               /* cost of loading MMX registers
2012                                            in SImode and DImode.  */
2013   {8, 8},                               /* cost of storing MMX registers
2014                                            in SImode and DImode.  */
2015   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
2016                                            register.  */
2017   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
2018                                            in 32,64,128,256 and 512-bit.  */
2019   {8, 8, 8, 12, 12},                    /* cost of storing SSE registers
2020                                            in 32,64,128,256 and 512-bit.  */
2021   6, 8,                                 /* SSE->integer and integer->SSE
2022                                            moves.  */
2023   8, 8,                                 /* mask->integer and integer->mask moves */
2024   {6, 6, 6},                            /* cost of loading mask register
2025                                            in QImode, HImode, SImode.  */
2026   {8, 8, 8},                            /* cost of storing mask register
2027                                            in QImode, HImode, SImode.  */
2028   2,                                    /* cost of moving mask register.  */
2029   /* End of register allocator costs.  */
2030   },
2031
2032   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
2033   /* TODO: Lea with 3 components has cost 2.  */
2034   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
2035   COSTS_N_INSNS (1),                    /* variable shift costs.  */
2036   COSTS_N_INSNS (1),                    /* constant shift costs.  */
2037   /* mul has latency 3, executes in 3 integer units.  */
2038   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
2039    COSTS_N_INSNS (3),                   /*                               HI.  */
2040    COSTS_N_INSNS (3),                   /*                               SI.  */
2041    COSTS_N_INSNS (3),                   /*                               DI.  */
2042    COSTS_N_INSNS (3)},                  /*                      other.  */
2043   0,                                    /* cost of multiply per each bit
2044                                            set.  */
2045   /* integer divide has latency of 8 cycles
2046      plus 1 for every 9 bits of quotient.  */
2047   {COSTS_N_INSNS (10),                  /* cost of a divide/mod for QI.  */
2048    COSTS_N_INSNS (11),                  /*                          HI.  */
2049    COSTS_N_INSNS (13),                  /*                          SI.  */
2050    COSTS_N_INSNS (16),                  /*                          DI.  */
2051    COSTS_N_INSNS (16)},                 /*                          other.  */
2052   COSTS_N_INSNS (1),                    /* cost of movsx.  */
2053   COSTS_N_INSNS (1),                    /* cost of movzx.  */
2054   15,                                   /* "large" insn.  */
2055   9,                                    /* MOVE_RATIO.  */
2056   6,                                    /* CLEAR_RATIO */
2057   {6, 6, 6},                            /* cost of loading integer registers
2058                                            in QImode, HImode and SImode.
2059                                            Relative to reg-reg move (2).  */
2060   {8, 8, 8},                            /* cost of storing integer
2061                                            registers.  */
2062   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
2063                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2064   {8, 8, 8, 12, 12},                    /* cost of storing SSE register
2065                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2066   {6, 6, 10, 10, 12},                   /* cost of unaligned loads.  */
2067   {8, 8, 8, 12, 12},                    /* cost of unaligned stores.  */
2068   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM
2069                                            register.  */
2070   6,                                    /* cost of moving SSE register to integer.  */
2071
2072   /* TODO: gather and scatter instructions are currently disabled in
2073      x86-tune.def.  In some cases they are however a win, see PR116582.
2074      We however need a good cost model for them.  */
2075   14, 10,                               /* Gather load static, per_elt.  */
2076   14, 20,                               /* Gather store static, per_elt.  */
2077   48,                                   /* size of l1 cache.  */
2078   1024,                                 /* size of l2 cache.  */
2079   64,                                   /* size of prefetch block.  */
2080   /* New AMD processors never drop prefetches; if they cannot be performed
2081      immediately, they are queued.  We set number of simultaneous prefetches
2082      to a large constant to reflect this (it probably is not a good idea not
2083      to limit number of prefetches at all, as their execution also takes some
2084      time).  */
2085   100,                                  /* number of parallel prefetches.  */
2086   3,                                    /* Branch cost.  */
2087   /* TODO x87 latencies are still based on znver4.
2088      Probably not very important these days.  */
2089   COSTS_N_INSNS (7),                    /* cost of FADD and FSUB insns.  */
2090   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
2091   /* Latency of fdiv is 8-15.  */
2092   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
2093   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2094   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2095   /* Latency of fsqrt is 4-10.  NOTE(review): the cost below is 25 - confirm.  */
2096   COSTS_N_INSNS (25),                   /* cost of FSQRT instruction.  */
2097
2098   /* SSE instructions have typical throughput 4 and latency 1.  */
2099   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2100   /* ADDSS has throughput 2 and latency 2
2101      (in some cases when source is another addition).  */
2102   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2103   /* MULSS has throughput 2 and latency 3.  */
2104   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
2105   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
2106   /* FMA has throughput 2 and latency 4.  */
2107   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2108   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2109   /* DIVSS has throughput 0.4 and latency 10.  */
2110   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
2111   /* DIVSD has throughput 0.25 and latency 13.  */
2112   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
2113   /* SQRTSS has throughput 0.22 and latency 14.  */
2114   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2115   /* SQRTSD has throughput 0.13 and latency 20.  */
2116   COSTS_N_INSNS (20),                   /* cost of SQRTSD instruction.  */
2117   /* Zen5 can execute:
2118       - integer ops: 6 per cycle, at most 3 multiplications.
2119         latency 1 for additions, 3 for multiplications (pipelined)
2120
2121         Setting width of 9 for multiplication is probably excessive
2122         for register pressure.
2123       - fp ops: 2 additions per cycle, latency 2-3
2124                 2 multiplications per cycle, latency 3
2125       - vector integer ops: 4 additions, latency 1
2126                            2 multiplications, latency 4
2127         We increase width to 6 for multiplications
2128         in ix86_reassociation_width.  */
2129   6, 6, 4, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
2130   znver2_memcpy,
2131   znver2_memset,
2132   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2133   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2134   "16",                                 /* Loop alignment.  */
2135   "16",                                 /* Jump alignment.  */
2136   "0:0:8",                              /* Label alignment.  */
2137   "16",                                 /* Func alignment.  */
2138   4,                                    /* Small unroll limit.  */
2139   2,                                    /* Small unroll factor.  */
2140 };
2141
2142 /* skylake_cost should produce code tuned for Skylake family of CPUs.  */
2143 static stringop_algs skylake_memcpy[2] =   {
2144   {libcall,
2145    {{256, rep_prefix_1_byte, true},
2146     {256, loop, false},
2147     {-1, libcall, false}}},
2148   {libcall,
2149    {{256, rep_prefix_1_byte, true},
2150     {256, loop, false},
2151     {-1, libcall, false}}}};
2152
2153 static stringop_algs skylake_memset[2] = {  /* memset strategy table referenced by skylake_cost below.  */
2154   {libcall,
2155    {{256, rep_prefix_1_byte, true},
2156     {256, loop, false},
2157     {-1, libcall, false}}},
2158   {libcall,
2159    {{256, rep_prefix_1_byte, true},
2160     {256, loop, false},
2161     {-1, libcall, false}}}};
2162
2163 static const
2164 struct processor_costs skylake_cost = {
2165   {
2166   /* Start of register allocator costs.  integer->integer move cost is 2. */
2167   6,                                 /* cost for loading QImode using movzbl */
2168   {4, 4, 4},                            /* cost of loading integer registers
2169                                            in QImode, HImode and SImode.
2170                                            Relative to reg-reg move (2).  */
2171   {6, 6, 6},                            /* cost of storing integer registers */
2172   2,                                    /* cost of reg,reg fld/fst */
2173   {6, 6, 8},                            /* cost of loading fp registers
2174                                            in SFmode, DFmode and XFmode */
2175   {6, 6, 10},                           /* cost of storing fp registers
2176                                            in SFmode, DFmode and XFmode */
2177   2,                                    /* cost of moving MMX register */
2178   {6, 6},                               /* cost of loading MMX registers
2179                                            in SImode and DImode */
2180   {6, 6},                               /* cost of storing MMX registers
2181                                            in SImode and DImode */
2182   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2183   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
2184                                            in 32,64,128,256 and 512-bit */
2185   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
2186                                            in 32,64,128,256 and 512-bit */
2187   6, 6,                         /* SSE->integer and integer->SSE moves */
2188   6, 6,                         /* mask->integer and integer->mask moves */
2189   {8, 8, 8},                            /* cost of loading mask register
2190                                            in QImode, HImode, SImode.  */
2191   {6, 6, 6},                            /* cost of storing mask register
2192                                            in QImode, HImode, SImode.  */
2193   3,                                    /* cost of moving mask register.  */
2194   /* End of register allocator costs.  */
2195   },
2196
2197   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2198   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction */
2199   COSTS_N_INSNS (1),                    /* variable shift costs */
2200   COSTS_N_INSNS (1),                    /* constant shift costs */
2201   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2202    COSTS_N_INSNS (3),                   /*                               HI */
2203    COSTS_N_INSNS (3),                   /*                               SI */
2204    COSTS_N_INSNS (3),                   /*                               DI */
2205    COSTS_N_INSNS (3)},                  /*                            other */
2206   0,                                    /* cost of multiply per each bit set */
2207   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2208      model is not realistic. We compensate by increasing the latencies a bit.  */
2209   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2210    COSTS_N_INSNS (11),                  /*                          HI */
2211    COSTS_N_INSNS (14),                  /*                          SI */
2212    COSTS_N_INSNS (76),                  /*                          DI */
2213    COSTS_N_INSNS (76)},                 /*                          other */
2214   COSTS_N_INSNS (1),                    /* cost of movsx */
2215   COSTS_N_INSNS (0),                    /* cost of movzx */
2216   8,                                    /* "large" insn */
2217   17,                                   /* MOVE_RATIO */
2218   17,                                   /* CLEAR_RATIO */
2219   {6, 6, 6},                            /* cost of loading integer registers
2220                                            in QImode, HImode and SImode.
2221                                            Relative to reg-reg move (2).  */
2222   {8, 8, 8},                            /* cost of storing integer registers */
2223   {8, 8, 8, 8, 16},                     /* cost of loading SSE register
2224                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2225   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
2226                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2227   {8, 8, 8, 8, 16},                     /* cost of unaligned loads.  */
2228   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
2229   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2230   6,                                    /* cost of moving SSE register to integer.  */
2231   20, 8,                                /* Gather load static, per_elt.  */
2232   22, 10,                               /* Gather store static, per_elt.  */
2233   64,                                   /* size of l1 cache.  */
2234   512,                                  /* size of l2 cache.  */
2235   64,                                   /* size of prefetch block */
2236   6,                                    /* number of parallel prefetches */
2237   3,                                    /* Branch cost */
2238   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2239   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2240   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2241   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2242   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2243   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
2244
2245   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2246   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2247   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2248   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2249   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2250   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2251   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
2252   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
2253   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
2254   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2255   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2256   skylake_memcpy,
2257   skylake_memset,
2258   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2259   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2260   "16:11:8",                            /* Loop alignment.  */
2261   "16:11:8",                            /* Jump alignment.  */
2262   "0:0:8",                              /* Label alignment.  */
2263   "16",                                 /* Func alignment.  */
2264   4,                                    /* Small unroll limit.  */
2265   2,                                    /* Small unroll factor.  */
2266 };
2267
2268 /* icelake_cost should produce code tuned for Icelake family of CPUs.
2269    NB: rep_prefix_1_byte is used only for known size. */
2270
2271 static stringop_algs icelake_memcpy[2] =   {  /* Same entries as skylake_memcpy.  */
2272   {libcall,
2273    {{256, rep_prefix_1_byte, true},
2274     {256, loop, false},
2275     {-1, libcall, false}}},
2276   {libcall,
2277    {{256, rep_prefix_1_byte, true},
2278     {256, loop, false},
2279     {-1, libcall, false}}}};
2280
2281 static stringop_algs icelake_memset[2] = {  /* Same entries as skylake_memset.  */
2282   {libcall,
2283    {{256, rep_prefix_1_byte, true},
2284     {256, loop, false},
2285     {-1, libcall, false}}},
2286   {libcall,
2287    {{256, rep_prefix_1_byte, true},
2288     {256, loop, false},
2289     {-1, libcall, false}}}};
2290
2291 static const
2292 struct processor_costs icelake_cost = {
2293   {
2294   /* Start of register allocator costs.  integer->integer move cost is 2. */
2295   6,                                 /* cost for loading QImode using movzbl */
2296   {4, 4, 4},                            /* cost of loading integer registers
2297                                            in QImode, HImode and SImode.
2298                                            Relative to reg-reg move (2).  */
2299   {6, 6, 6},                            /* cost of storing integer registers */
2300   2,                                    /* cost of reg,reg fld/fst */
2301   {6, 6, 8},                            /* cost of loading fp registers
2302                                            in SFmode, DFmode and XFmode */
2303   {6, 6, 10},                           /* cost of storing fp registers
2304                                            in SFmode, DFmode and XFmode */
2305   2,                                    /* cost of moving MMX register */
2306   {6, 6},                               /* cost of loading MMX registers
2307                                            in SImode and DImode */
2308   {6, 6},                               /* cost of storing MMX registers
2309                                            in SImode and DImode */
2310   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2311   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
2312                                            in 32,64,128,256 and 512-bit */
2313   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
2314                                            in 32,64,128,256 and 512-bit */
2315   6, 6,                         /* SSE->integer and integer->SSE moves */
2316   6, 6,                         /* mask->integer and integer->mask moves */
2317   {8, 8, 8},                            /* cost of loading mask register
2318                                            in QImode, HImode, SImode.  */
2319   {6, 6, 6},                            /* cost if storing mask register
2320                                            in QImode, HImode, SImode.  */
2321   3,                                    /* cost of moving mask register.  */
2322   /* End of register allocator costs.  */
2323   },
2324
2325   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2326   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction */
2327   COSTS_N_INSNS (1),                    /* variable shift costs */
2328   COSTS_N_INSNS (1),                    /* constant shift costs */
2329   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2330    COSTS_N_INSNS (3),                   /*                               HI */
2331    COSTS_N_INSNS (3),                   /*                               SI */
2332    COSTS_N_INSNS (3),                   /*                               DI */
2333    COSTS_N_INSNS (3)},                  /*                            other */
2334   0,                                    /* cost of multiply per each bit set */
2335   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2336      model is not realistic. We compensate by increasing the latencies a bit.  */
2337   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2338    COSTS_N_INSNS (11),                  /*                          HI */
2339    COSTS_N_INSNS (14),                  /*                          SI */
2340    COSTS_N_INSNS (76),                  /*                          DI */
2341    COSTS_N_INSNS (76)},                 /*                          other */
2342   COSTS_N_INSNS (1),                    /* cost of movsx */
2343   COSTS_N_INSNS (0),                    /* cost of movzx */
2344   8,                                    /* "large" insn */
2345   17,                                   /* MOVE_RATIO */
2346   17,                                   /* CLEAR_RATIO */
2347   {6, 6, 6},                            /* cost of loading integer registers
2348                                            in QImode, HImode and SImode.
2349                                            Relative to reg-reg move (2).  */
2350   {8, 8, 8},                            /* cost of storing integer registers */
2351   {8, 8, 8, 8, 16},                     /* cost of loading SSE register
2352                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2353   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
2354                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2355   {8, 8, 8, 8, 16},                     /* cost of unaligned loads.  */
2356   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
2357   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2358   6,                                    /* cost of moving SSE register to integer.  */
2359   20, 8,                                /* Gather load static, per_elt.  */
2360   22, 10,                               /* Gather store static, per_elt.  */
2361   64,                                   /* size of l1 cache.  */
2362   512,                                  /* size of l2 cache.  */
2363   64,                                   /* size of prefetch block */
2364   6,                                    /* number of parallel prefetches */
2365   3,                                    /* Branch cost */
2366   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2367   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2368   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2369   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2370   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2371   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
2372
2373   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2374   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2375   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2376   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2377   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2378   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2379   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
2380   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
2381   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
2382   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2383   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2384   icelake_memcpy,
2385   icelake_memset,
2386   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2387   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2388   "16:11:8",                            /* Loop alignment.  */
2389   "16:11:8",                            /* Jump alignment.  */
2390   "0:0:8",                              /* Label alignment.  */
2391   "16",                                 /* Func alignment.  */
2392   4,                                    /* Small unroll limit.  */
2393   2,                                    /* Small unroll factor.  */
2394 };
2395
2396 /* alderlake_cost should produce code tuned for the alderlake family of CPUs.  */
2397 static stringop_algs alderlake_memcpy[2] = {
2398   {libcall,
2399    {{256, rep_prefix_1_byte, true},
2400     {256, loop, false},
2401     {-1, libcall, false}}},
2402   {libcall,
2403    {{256, rep_prefix_1_byte, true},
2404     {256, loop, false},
2405     {-1, libcall, false}}}};
/* Inline memset strategy table referenced by alderlake_cost.
   The array has two entries, which are identical here.  Each entry names a
   leading algorithm (libcall) followed by a list of {bound, algorithm, flag}
   rows: rep_prefix_1_byte with bound 256, loop with bound 256, and libcall
   with bound -1.
   NOTE(review): presumably the bound is the largest block size a row applies
   to, -1 means "no upper bound", and entry [0]/[1] select 32-bit/64-bit
   code -- confirm against the stringop_algs declaration.
   NOTE(review): the `loop' row repeats the bound 256 of the row before it;
   if rows are matched in order by bound it looks like it is never
   selected -- confirm this is intended.  */
2406 static stringop_algs alderlake_memset[2] = {
2407   {libcall,
2408    {{256, rep_prefix_1_byte, true},
2409     {256, loop, false},
2410     {-1, libcall, false}}},
2411   {libcall,
2412    {{256, rep_prefix_1_byte, true},
2413     {256, loop, false},
2414     {-1, libcall, false}}}};
/* Tuning cost table for alderlake.
   This is a purely positional initializer for struct processor_costs
   (declared elsewhere): each value is identified only by its position and
   by the comment that trails it, so entries must not be reordered, added or
   removed without matching the struct declaration.
   The first braced group holds the register allocator costs.  Load, store
   and move costs are then listed a second time in the main group; the two
   sets are separate values and differ here (e.g. integer stores are
   {6, 6, 6} in the first group and {8, 8, 8} in the second).
   Values wrapped in COSTS_N_INSNS are expressed in instruction units.  */
2415 static const
2416 struct processor_costs alderlake_cost = {
2417   {
2418   /* Start of register allocator costs.  integer->integer move cost is 2.  */
2419   6,                                 /* cost for loading QImode using movzbl */
2420   {6, 6, 6},                            /* cost of loading integer registers
2421                                            in QImode, HImode and SImode.
2422                                            Relative to reg-reg move (2).  */
2423   {6, 6, 6},                            /* cost of storing integer registers */
2424   4,                                    /* cost of reg,reg fld/fst */
2425   {6, 6, 12},                           /* cost of loading fp registers
2426                                            in SFmode, DFmode and XFmode */
2427   {6, 6, 12},                           /* cost of storing fp registers
2428                                            in SFmode, DFmode and XFmode */
2429   2,                                    /* cost of moving MMX register */
2430   {6, 6},                               /* cost of loading MMX registers
2431                                            in SImode and DImode */
2432   {6, 6},                               /* cost of storing MMX registers
2433                                            in SImode and DImode */
2434   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2435   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2436                                            in 32,64,128,256 and 512-bit */
2437   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2438                                            in 32,64,128,256 and 512-bit */
2439   6, 6,                         /* SSE->integer and integer->SSE moves */
2440   6, 6,                         /* mask->integer and integer->mask moves */
2441   {6, 6, 6},                            /* cost of loading mask register
2442                                            in QImode, HImode, SImode.  */
2443   {6, 6, 6},                    /* cost of storing mask register
2444                                            in QImode, HImode, SImode.  */
2445   2,                                    /* cost of moving mask register.  */
2446   /* End of register allocator costs.  */
2447   },
2448
2449   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2450   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; one
                                           unit above the cost of an add */
2451   COSTS_N_INSNS (1),                    /* variable shift costs */
2452   COSTS_N_INSNS (1),                    /* constant shift costs */
2453   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2454    COSTS_N_INSNS (3),                   /*                               HI */
2455    COSTS_N_INSNS (3),                   /*                               SI */
2456    COSTS_N_INSNS (3),                   /*                               DI */
2457    COSTS_N_INSNS (4)},                  /*                            other */
2458   0,                                    /* cost of multiply per each bit set */
2459   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
2460    COSTS_N_INSNS (22),                  /*                          HI */
2461    COSTS_N_INSNS (30),                  /*                          SI */
2462    COSTS_N_INSNS (74),                  /*                          DI */
2463    COSTS_N_INSNS (74)},                 /*                          other */
2464   COSTS_N_INSNS (1),                    /* cost of movsx */
2465   COSTS_N_INSNS (1),                    /* cost of movzx */
2466   8,                                    /* "large" insn */
2467   17,                                   /* MOVE_RATIO */
2468   17,                                   /* CLEAR_RATIO */
2469   {6, 6, 6},                            /* cost of loading integer registers
2470                                            in QImode, HImode and SImode.
2471                                            Relative to reg-reg move (2).  */
2472   {8, 8, 8},                            /* cost of storing integer registers */
2473   {8, 8, 8, 10, 15},                    /* cost of loading SSE register
2474                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2475   {8, 8, 8, 10, 15},                    /* cost of storing SSE register
2476                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2477   {8, 8, 8, 10, 15},                    /* cost of unaligned loads.  */
2478   {8, 8, 8, 10, 15},                    /* cost of unaligned stores.  */
2479   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2480   6,                                    /* cost of moving SSE register to integer.  */
2481   18, 6,                                /* Gather load static, per_elt.  */
2482   18, 6,                                /* Gather store static, per_elt.  */
2483   32,                                   /* size of l1 cache.  */
2484   512,                                  /* size of l2 cache.  */
2485   64,                                   /* size of prefetch block */
2486   6,                                    /* number of parallel prefetches */
2487   3,                                    /* Branch cost */
2488   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2489   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2490   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
2491   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2492   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2493   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
2494
2495   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2496   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2497   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2498   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2499   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2500   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2501   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2502   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2503   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2504   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2505   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
2506   alderlake_memcpy,
2507   alderlake_memset,
2508   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2509   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2510   "16:11:8",                            /* Loop alignment.  */
2511   "16:11:8",                            /* Jump alignment.  */
2512   "0:0:8",                              /* Label alignment.  */
2513   "16",                                 /* Func alignment.  */
2514   4,                                    /* Small unroll limit.  */
2515   2,                                    /* Small unroll factor.  */
2516 };
2517
2518   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
2519      very small blocks it is better to use a loop.  For large blocks, a libcall
2520      can do non-temporal accesses and beat inline code considerably.  */
/* Inline memcpy strategy table referenced by btver1_cost.
   Two entries, each a leading algorithm (libcall) plus a list of
   {bound, algorithm, flag} rows.  The first entry uses loop (bound 6),
   unrolled_loop (bound 14) and rep_prefix_4_byte (bound -1); the second
   uses loop (bound 16), rep_prefix_8_byte (bound 8192) and libcall
   (bound -1).
   NOTE(review): presumably the bound is the largest block size a row applies
   to, -1 means "no upper bound", and entry [0]/[1] select 32-bit/64-bit
   code -- confirm against the stringop_algs declaration.  */
2521 static stringop_algs btver1_memcpy[2] = {
2522   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2523              {-1, rep_prefix_4_byte, false}}},
2524   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2525              {-1, libcall, false}}}};
/* Inline memset strategy table referenced by btver1_cost.
   Two entries, each a leading algorithm (libcall) plus a list of
   {bound, algorithm, flag} rows.  The first entry uses loop (bound 8),
   unrolled_loop (bound 24), rep_prefix_4_byte (bound 2048) and libcall
   (bound -1); the second uses unrolled_loop (bound 48), rep_prefix_8_byte
   (bound 8192) and libcall (bound -1).
   NOTE(review): presumably the bound is the largest block size a row applies
   to, -1 means "no upper bound", and entry [0]/[1] select 32-bit/64-bit
   code -- confirm against the stringop_algs declaration.  */
2526 static stringop_algs btver1_memset[2] = {
2527   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2528              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2529   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2530              {-1, libcall, false}}}};
/* Tuning cost table for btver1.
   This is a purely positional initializer for struct processor_costs
   (declared elsewhere): each value is identified only by its position and
   by the comment that trails it, so entries must not be reordered, added or
   removed without matching the struct declaration.
   The first braced group holds the register allocator costs; load, store
   and move costs are then listed a second time in the main group.
   Values wrapped in COSTS_N_INSNS are expressed in instruction units.  */
2531 const struct processor_costs btver1_cost = {
2532   {
2533   /* Start of register allocator costs.  integer->integer move cost is 2. */
2534   8,                                 /* cost for loading QImode using movzbl */
2535   {6, 8, 6},                            /* cost of loading integer registers
2536                                            in QImode, HImode and SImode.
2537                                            Relative to reg-reg move (2).  */
2538   {6, 8, 6},                            /* cost of storing integer registers */
2539   4,                                    /* cost of reg,reg fld/fst */
2540   {12, 12, 28},                         /* cost of loading fp registers
2541                                            in SFmode, DFmode and XFmode */
2542   {12, 12, 38},                         /* cost of storing fp registers
2543                                            in SFmode, DFmode and XFmode */
2544   4,                                    /* cost of moving MMX register */
2545   {10, 10},                             /* cost of loading MMX registers
2546                                            in SImode and DImode */
2547   {12, 12},                             /* cost of storing MMX registers
2548                                            in SImode and DImode */
2549   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2550   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
2551                                            in 32,64,128,256 and 512-bit */
2552   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
2553                                            in 32,64,128,256 and 512-bit */
2554   14, 14,                               /* SSE->integer and integer->SSE moves */
2555   14, 14,                               /* mask->integer and integer->mask moves */
2556   {6, 8, 6},                            /* cost of loading mask register
2557                                            in QImode, HImode, SImode.  */
2558   {6, 8, 6},                            /* cost of storing mask register
2559                                            in QImode, HImode, SImode.  */
2560   2,                                    /* cost of moving mask register.  */
2561   /* End of register allocator costs.  */
2562   },
2563
2564   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2565   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
2566   COSTS_N_INSNS (1),                    /* variable shift costs */
2567   COSTS_N_INSNS (1),                    /* constant shift costs */
2568   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2569    COSTS_N_INSNS (4),                   /*                               HI */
2570    COSTS_N_INSNS (3),                   /*                               SI */
2571    COSTS_N_INSNS (4),                   /*                               DI */
2572    COSTS_N_INSNS (5)},                  /*                            other */
2573   0,                                    /* cost of multiply per each bit set */
2574   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
2575    COSTS_N_INSNS (35),                  /*                          HI */
2576    COSTS_N_INSNS (51),                  /*                          SI */
2577    COSTS_N_INSNS (83),                  /*                          DI */
2578    COSTS_N_INSNS (83)},                 /*                          other */
2579   COSTS_N_INSNS (1),                    /* cost of movsx */
2580   COSTS_N_INSNS (1),                    /* cost of movzx */
2581   8,                                    /* "large" insn */
2582   9,                                    /* MOVE_RATIO */
2583   6,                                    /* CLEAR_RATIO */
2584   {6, 8, 6},                            /* cost of loading integer registers
2585                                            in QImode, HImode and SImode.
2586                                            Relative to reg-reg move (2).  */
2587   {6, 8, 6},                            /* cost of storing integer registers */
2588   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
2589                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2590   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
2591                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2592   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
2593   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
2594   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2595   14,                                   /* cost of moving SSE register to integer.  */
2596   10, 10,                               /* Gather load static, per_elt.  */
2597   10, 10,                               /* Gather store static, per_elt.  */
2598   32,                                   /* size of l1 cache.  */
2599   512,                                  /* size of l2 cache.  */
2600   64,                                   /* size of prefetch block */
2601   100,                                  /* number of parallel prefetches */
2602   2,                                    /* Branch cost */
2603   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
2604   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2605   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
2606   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2607   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2608   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
2609
2610   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2611   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2612   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
2613   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2614   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2615   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2616   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2617   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2618   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2619   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
2620   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2621   btver1_memcpy,
2622   btver1_memset,
2623   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
2624   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2625   "16:11:8",                            /* Loop alignment.  */
2626   "16:8:8",                             /* Jump alignment.  */
2627   "0:0:8",                              /* Label alignment.  */
2628   "11",                                 /* Func alignment.  */
2629   4,                                    /* Small unroll limit.  */
2630   2,                                    /* Small unroll factor.  */
2631 };
2632
/* Inline memcpy strategy table referenced by btver2_cost.
   Two entries, each a leading algorithm (libcall) plus a list of
   {bound, algorithm, flag} rows.  The first entry uses loop (bound 6),
   unrolled_loop (bound 14) and rep_prefix_4_byte (bound -1); the second
   uses loop (bound 16), rep_prefix_8_byte (bound 8192) and libcall
   (bound -1).  The values are the same as those in btver1_memcpy.
   NOTE(review): presumably the bound is the largest block size a row applies
   to, -1 means "no upper bound", and entry [0]/[1] select 32-bit/64-bit
   code -- confirm against the stringop_algs declaration.  */
2633 static stringop_algs btver2_memcpy[2] = {
2634   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2635              {-1, rep_prefix_4_byte, false}}},
2636   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2637              {-1, libcall, false}}}};
/* Inline memset strategy table referenced by btver2_cost.
   Two entries, each a leading algorithm (libcall) plus a list of
   {bound, algorithm, flag} rows.  The first entry uses loop (bound 8),
   unrolled_loop (bound 24), rep_prefix_4_byte (bound 2048) and libcall
   (bound -1); the second uses unrolled_loop (bound 48), rep_prefix_8_byte
   (bound 8192) and libcall (bound -1).  The values are the same as those
   in btver1_memset.
   NOTE(review): presumably the bound is the largest block size a row applies
   to, -1 means "no upper bound", and entry [0]/[1] select 32-bit/64-bit
   code -- confirm against the stringop_algs declaration.  */
2638 static stringop_algs btver2_memset[2] = {
2639   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2640              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2641   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2642              {-1, libcall, false}}}};
/* Tuning cost table for btver2.
   This is a purely positional initializer for struct processor_costs
   (declared elsewhere): each value is identified only by its position and
   by the comment that trails it, so entries must not be reordered, added or
   removed without matching the struct declaration.
   The first braced group holds the register allocator costs; load, store
   and move costs are then listed a second time in the main group.
   Values wrapped in COSTS_N_INSNS are expressed in instruction units.  */
2643 const struct processor_costs btver2_cost = {
2644   {
2645   /* Start of register allocator costs.  integer->integer move cost is 2. */
2646   8,                                 /* cost for loading QImode using movzbl */
2647   {8, 8, 6},                            /* cost of loading integer registers
2648                                            in QImode, HImode and SImode.
2649                                            Relative to reg-reg move (2).  */
2650   {8, 8, 6},                            /* cost of storing integer registers */
2651   4,                                    /* cost of reg,reg fld/fst */
2652   {12, 12, 28},                         /* cost of loading fp registers
2653                                            in SFmode, DFmode and XFmode */
2654   {12, 12, 38},                         /* cost of storing fp registers
2655                                            in SFmode, DFmode and XFmode */
2656   4,                                    /* cost of moving MMX register */
2657   {10, 10},                             /* cost of loading MMX registers
2658                                            in SImode and DImode */
2659   {12, 12},                             /* cost of storing MMX registers
2660                                            in SImode and DImode */
2661   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2662   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
2663                                            in 32,64,128,256 and 512-bit */
2664   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
2665                                            in 32,64,128,256 and 512-bit */
2666   14, 14,                               /* SSE->integer and integer->SSE moves */
2667   14, 14,                               /* mask->integer and integer->mask moves */
2668   {8, 8, 6},                            /* cost of loading mask register
2669                                            in QImode, HImode, SImode.  */
2670   {8, 8, 6},                            /* cost of storing mask register
2671                                            in QImode, HImode, SImode.  */
2672   2,                                    /* cost of moving mask register.  */
2673   /* End of register allocator costs.  */
2674   },
2675
2676   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2677   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
2678   COSTS_N_INSNS (1),                    /* variable shift costs */
2679   COSTS_N_INSNS (1),                    /* constant shift costs */
2680   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2681    COSTS_N_INSNS (4),                   /*                               HI */
2682    COSTS_N_INSNS (3),                   /*                               SI */
2683    COSTS_N_INSNS (4),                   /*                               DI */
2684    COSTS_N_INSNS (5)},                  /*                            other */
2685   0,                                    /* cost of multiply per each bit set */
2686   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
2687    COSTS_N_INSNS (35),                  /*                          HI */
2688    COSTS_N_INSNS (51),                  /*                          SI */
2689    COSTS_N_INSNS (83),                  /*                          DI */
2690    COSTS_N_INSNS (83)},                 /*                          other */
2691   COSTS_N_INSNS (1),                    /* cost of movsx */
2692   COSTS_N_INSNS (1),                    /* cost of movzx */
2693   8,                                    /* "large" insn */
2694   9,                                    /* MOVE_RATIO */
2695   6,                                    /* CLEAR_RATIO */
2696   {8, 8, 6},                            /* cost of loading integer registers
2697                                            in QImode, HImode and SImode.
2698                                            Relative to reg-reg move (2).  */
2699   {8, 8, 6},                            /* cost of storing integer registers */
2700   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
2701                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2702   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
2703                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2704   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
2705   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
2706   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2707   14,                                   /* cost of moving SSE register to integer.  */
2708   10, 10,                               /* Gather load static, per_elt.  */
2709   10, 10,                               /* Gather store static, per_elt.  */
2710   32,                                   /* size of l1 cache.  */
2711   2048,                                 /* size of l2 cache.  */
2712   64,                                   /* size of prefetch block */
2713   100,                                  /* number of parallel prefetches */
2714   2,                                    /* Branch cost */
2715   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
2716   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2717   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
2718   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2719   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2720   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
2721
2722   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2723   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2724   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
2725   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2726   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2727   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2728   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2729   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
2730   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
2731   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
2732   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2733   btver2_memcpy,
2734   btver2_memset,
2735   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
2736   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2737   "16:11:8",                            /* Loop alignment.  */
2738   "16:8:8",                             /* Jump alignment.  */
2739   "0:0:8",                              /* Label alignment.  */
2740   "11",                                 /* Func alignment.  */
2741   4,                                    /* Small unroll limit.  */
2742   2,                                    /* Small unroll factor.  */
2743 };
2744
/* Inline memcpy strategy table for pentium4.
   The first entry names libcall as the leading algorithm and lists
   loop_1_byte (bound 12) and rep_prefix_4_byte (bound -1).  The second
   entry is DUMMY_STRINGOP_ALGS, which expands to a libcall-only table.
   NOTE(review): presumably the bound is the largest block size a row applies
   to, -1 means "no upper bound", and entry [0]/[1] select 32-bit/64-bit
   code -- confirm against the stringop_algs declaration.  */
2745 static stringop_algs pentium4_memcpy[2] = {
2746   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2747   DUMMY_STRINGOP_ALGS};
/* Inline memset strategy table for pentium4.
   The first entry names libcall as the leading algorithm and lists
   loop_1_byte (bound 6), loop (bound 48), rep_prefix_4_byte (bound 20480)
   and libcall (bound -1).  The second entry is DUMMY_STRINGOP_ALGS, which
   expands to a libcall-only table.
   NOTE(review): presumably the bound is the largest block size a row applies
   to, -1 means "no upper bound", and entry [0]/[1] select 32-bit/64-bit
   code -- confirm against the stringop_algs declaration.  */
2748 static stringop_algs pentium4_memset[2] = {
2749   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2750              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2751   DUMMY_STRINGOP_ALGS};
2752
2753 static const
2754 struct processor_costs pentium4_cost = {
2755   {
2756   /* Start of register allocator costs.  integer->integer move cost is 2. */
2757   5,                                 /* cost for loading QImode using movzbl */
2758   {4, 5, 4},                            /* cost of loading integer registers
2759                                            in QImode, HImode and SImode.
2760                                            Relative to reg-reg move (2).  */
2761   {2, 3, 2},                            /* cost of storing integer registers */
2762   12,                                   /* cost of reg,reg fld/fst */
2763   {14, 14, 14},                         /* cost of loading fp registers
2764                                            in SFmode, DFmode and XFmode */
2765   {14, 14, 14},                         /* cost of storing fp registers
2766                                            in SFmode, DFmode and XFmode */
2767   12,                                   /* cost of moving MMX register */
2768   {16, 16},                             /* cost of loading MMX registers
2769                                            in SImode and DImode */
2770   {16, 16},                             /* cost of storing MMX registers
2771                                            in SImode and DImode */
2772   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2773   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
2774                                            in 32,64,128,256 and 512-bit */
2775   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
2776                                            in 32,64,128,256 and 512-bit */
2777   20, 12,                               /* SSE->integer and integer->SSE moves */
2778   20, 12,                               /* mask->integer and integer->mask moves */
2779   {4, 5, 4},                            /* cost of loading mask register
2780                                            in QImode, HImode, SImode.  */
2781   {2, 3, 2},                            /* cost if storing mask register
2782                                            in QImode, HImode, SImode.  */
2783   2,                                    /* cost of moving mask register.  */
2784   /* End of register allocator costs.  */
2785   },
2786
2787   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2788   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
2789   COSTS_N_INSNS (4),                    /* variable shift costs */
2790   COSTS_N_INSNS (4),                    /* constant shift costs */
2791   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
2792    COSTS_N_INSNS (15),                  /*                               HI */
2793    COSTS_N_INSNS (15),                  /*                               SI */
2794    COSTS_N_INSNS (15),                  /*                               DI */
2795    COSTS_N_INSNS (15)},                 /*                            other */
2796   0,                                    /* cost of multiply per each bit set */
2797   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
2798    COSTS_N_INSNS (56),                  /*                          HI */
2799    COSTS_N_INSNS (56),                  /*                          SI */
2800    COSTS_N_INSNS (56),                  /*                          DI */
2801    COSTS_N_INSNS (56)},                 /*                          other */
2802   COSTS_N_INSNS (1),                    /* cost of movsx */
2803   COSTS_N_INSNS (1),                    /* cost of movzx */
2804   16,                                   /* "large" insn */
2805   6,                                    /* MOVE_RATIO */
2806   6,                                    /* CLEAR_RATIO */
2807   {4, 5, 4},                            /* cost of loading integer registers
2808                                            in QImode, HImode and SImode.
2809                                            Relative to reg-reg move (2).  */
2810   {2, 3, 2},                            /* cost of storing integer registers */
2811   {16, 16, 16, 32, 64},                 /* cost of loading SSE register
2812                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2813   {16, 16, 16, 32, 64},                 /* cost of storing SSE register
2814                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2815   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
2816   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
2817   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2818   20,                                   /* cost of moving SSE register to integer.  */
2819   16, 16,                               /* Gather load static, per_elt.  */
2820   16, 16,                               /* Gather store static, per_elt.  */
2821   8,                                    /* size of l1 cache.  */
2822   256,                                  /* size of l2 cache.  */
2823   64,                                   /* size of prefetch block */
2824   6,                                    /* number of parallel prefetches */
2825   2,                                    /* Branch cost */
2826   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
2827   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
2828   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
2829   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2830   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2831   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
2832
2833   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2834   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2835   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
2836   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
2837   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2838   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2839   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
2840   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
2841   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
2842   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
2843   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2844   pentium4_memcpy,
2845   pentium4_memset,
2846   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2847   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2848   NULL,                                 /* Loop alignment.  */
2849   NULL,                                 /* Jump alignment.  */
2850   NULL,                                 /* Label alignment.  */
2851   NULL,                                 /* Func alignment.  */
2852   4,                                    /* Small unroll limit.  */
2853   2,                                    /* Small unroll factor.  */
2854 };
2855
2856 static stringop_algs nocona_memcpy[2] = {
       /* Table referenced by nocona_cost.  Both entries have the shape
          {alg, {{bound, alg, flag}, ...}}, with -1 as the last bound.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, the
          leading alg covers unknown sizes, and each triple selects ALG for
          blocks of at most BOUND bytes (-1 = no limit) -- confirm.  */
2857   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2858   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2859              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2860
2861 static stringop_algs nocona_memset[2] = {
       /* Table referenced by nocona_cost.  Both entries have the shape
          {alg, {{bound, alg, flag}, ...}}, with -1 as the last bound.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, the
          leading alg covers unknown sizes, and each triple selects ALG for
          blocks of at most BOUND bytes (-1 = no limit) -- confirm.  */
2862   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2863              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2864   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2865              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2866
2867 static const
2868 struct processor_costs nocona_cost = {
       /* Positional initializer: every value must stay in the field order of
          struct processor_costs (declared outside this chunk).  The nested
          brace group that follows holds the register allocator costs.  */
2869   {
2870   /* Start of register allocator costs.  integer->integer move cost is 2. */
2871   4,                                 /* cost for loading QImode using movzbl */
2872   {4, 4, 4},                            /* cost of loading integer registers
2873                                            in QImode, HImode and SImode.
2874                                            Relative to reg-reg move (2).  */
2875   {4, 4, 4},                            /* cost of storing integer registers */
2876   12,                                   /* cost of reg,reg fld/fst */
2877   {14, 14, 14},                         /* cost of loading fp registers
2878                                            in SFmode, DFmode and XFmode */
2879   {14, 14, 14},                         /* cost of storing fp registers
2880                                            in SFmode, DFmode and XFmode */
2881   14,                                   /* cost of moving MMX register */
2882   {12, 12},                             /* cost of loading MMX registers
2883                                            in SImode and DImode */
2884   {12, 12},                             /* cost of storing MMX registers
2885                                            in SImode and DImode */
2886   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2887   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
2888                                            in 32,64,128,256 and 512-bit */
2889   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
2890                                            in 32,64,128,256 and 512-bit */
2891   20, 12,                               /* SSE->integer and integer->SSE moves */
2892   20, 12,                               /* mask->integer and integer->mask moves */
2893   {4, 4, 4},                            /* cost of loading mask register
2894                                            in QImode, HImode, SImode.  */
2895   {4, 4, 4},                            /* cost of storing mask register
2896                                            in QImode, HImode, SImode.  */
2897   2,                                    /* cost of moving mask register.  */
2898   /* End of register allocator costs.  */
2899   },
2900
       /* Instruction costs.  The file header describes processor costs as
          relative to an add.  */
2901   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2902   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
2903   COSTS_N_INSNS (1),                    /* variable shift costs */
2904   COSTS_N_INSNS (1),                    /* constant shift costs */
2905   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
2906    COSTS_N_INSNS (10),                  /*                               HI */
2907    COSTS_N_INSNS (10),                  /*                               SI */
2908    COSTS_N_INSNS (10),                  /*                               DI */
2909    COSTS_N_INSNS (10)},                 /*                            other */
2910   0,                                    /* cost of multiply per each bit set */
2911   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
2912    COSTS_N_INSNS (66),                  /*                          HI */
2913    COSTS_N_INSNS (66),                  /*                          SI */
2914    COSTS_N_INSNS (66),                  /*                          DI */
2915    COSTS_N_INSNS (66)},                 /*                          other */
2916   COSTS_N_INSNS (1),                    /* cost of movsx */
2917   COSTS_N_INSNS (1),                    /* cost of movzx */
2918   16,                                   /* "large" insn */
2919   17,                                   /* MOVE_RATIO */
2920   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store, XMM/YMM/ZMM move and SSE->integer
          entries below carry the same values as the corresponding entries in
          the register allocator section above.  */
2921   {4, 4, 4},                            /* cost of loading integer registers
2922                                            in QImode, HImode and SImode.
2923                                            Relative to reg-reg move (2).  */
2924   {4, 4, 4},                            /* cost of storing integer registers */
2925   {12, 12, 12, 24, 48},                 /* cost of loading SSE register
2926                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2927   {12, 12, 12, 24, 48},                 /* cost of storing SSE register
2928                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2929   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
2930   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
2931   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2932   20,                                   /* cost of moving SSE register to integer.  */
2933   12, 12,                               /* Gather load static, per_elt.  */
2934   12, 12,                               /* Gather store static, per_elt.  */
2935   8,                                    /* size of l1 cache.  */
2936   1024,                                 /* size of l2 cache.  */
2937   64,                                   /* size of prefetch block */
2938   8,                                    /* number of parallel prefetches */
2939   1,                                    /* Branch cost */
2940   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
2941   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2942   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
2943   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
2944   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
2945   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
2946
2947   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2948   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2949   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
2950   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
2951   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
2952   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
2953   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
2954   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
2955   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
2956   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
2957   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
       /* String-operation algorithm tables defined just above this struct.  */
2958   nocona_memcpy,
2959   nocona_memset,
2960   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2961   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2962   NULL,                                 /* Loop alignment.  */
2963   NULL,                                 /* Jump alignment.  */
2964   NULL,                                 /* Label alignment.  */
2965   NULL,                                 /* Func alignment.  */
2966   4,                                    /* Small unroll limit.  */
2967   2,                                    /* Small unroll factor.  */
2968 };
2969
2970 static stringop_algs atom_memcpy[2] = {
       /* Table referenced by atom_cost.  Both entries have the shape
          {alg, {{bound, alg, flag}, ...}}, with -1 as the last bound.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, the
          leading alg covers unknown sizes, and each triple selects ALG for
          blocks of at most BOUND bytes (-1 = no limit) -- confirm.  */
2971   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2972   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2973              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2974 static stringop_algs atom_memset[2] = {
       /* Table referenced by atom_cost.  Both entries have the shape
          {alg, {{bound, alg, flag}, ...}}, with -1 as the last bound.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, the
          leading alg covers unknown sizes, and each triple selects ALG for
          blocks of at most BOUND bytes (-1 = no limit) -- confirm.  */
2975   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2976              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2977   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2978              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2979 static const
2980 struct processor_costs atom_cost = {
       /* Positional initializer: every value must stay in the field order of
          struct processor_costs (declared outside this chunk).  The nested
          brace group that follows holds the register allocator costs.  */
2981   {
2982   /* Start of register allocator costs.  integer->integer move cost is 2. */
2983   6,                                    /* cost for loading QImode using movzbl */
2984   {6, 6, 6},                            /* cost of loading integer registers
2985                                            in QImode, HImode and SImode.
2986                                            Relative to reg-reg move (2).  */
2987   {6, 6, 6},                            /* cost of storing integer registers */
2988   4,                                    /* cost of reg,reg fld/fst */
2989   {6, 6, 18},                           /* cost of loading fp registers
2990                                            in SFmode, DFmode and XFmode */
2991   {14, 14, 24},                         /* cost of storing fp registers
2992                                            in SFmode, DFmode and XFmode */
2993   2,                                    /* cost of moving MMX register */
2994   {8, 8},                               /* cost of loading MMX registers
2995                                            in SImode and DImode */
2996   {10, 10},                             /* cost of storing MMX registers
2997                                            in SImode and DImode */
2998   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2999   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
3000                                            in 32,64,128,256 and 512-bit */
3001   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
3002                                            in 32,64,128,256 and 512-bit */
3003   8, 6,                         /* SSE->integer and integer->SSE moves */
3004   8, 6,                         /* mask->integer and integer->mask moves */
3005   {6, 6, 6},                            /* cost of loading mask register
3006                                            in QImode, HImode, SImode.  */
3007   {6, 6, 6},                    /* cost of storing mask register
3008                                            in QImode, HImode, SImode.  */
3009   2,                                    /* cost of moving mask register.  */
3010   /* End of register allocator costs.  */
3011   },
3012
       /* Instruction costs.  The file header describes processor costs as
          relative to an add.  */
3013   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3014   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction: one unit above an add */
3015   COSTS_N_INSNS (1),                    /* variable shift costs */
3016   COSTS_N_INSNS (1),                    /* constant shift costs */
3017   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3018    COSTS_N_INSNS (4),                   /*                               HI */
3019    COSTS_N_INSNS (3),                   /*                               SI */
3020    COSTS_N_INSNS (4),                   /*                               DI */
3021    COSTS_N_INSNS (2)},                  /*                            other */
3022   0,                                    /* cost of multiply per each bit set */
3023   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3024    COSTS_N_INSNS (26),                  /*                          HI */
3025    COSTS_N_INSNS (42),                  /*                          SI */
3026    COSTS_N_INSNS (74),                  /*                          DI */
3027    COSTS_N_INSNS (74)},                 /*                          other */
3028   COSTS_N_INSNS (1),                    /* cost of movsx */
3029   COSTS_N_INSNS (1),                    /* cost of movzx */
3030   8,                                    /* "large" insn */
3031   17,                                   /* MOVE_RATIO */
3032   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store, XMM/YMM/ZMM move and SSE->integer
          entries below carry the same values as the corresponding entries in
          the register allocator section above.  */
3033   {6, 6, 6},                            /* cost of loading integer registers
3034                                            in QImode, HImode and SImode.
3035                                            Relative to reg-reg move (2).  */
3036   {6, 6, 6},                            /* cost of storing integer registers */
3037   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
3038                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3039   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
3040                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3041   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
3042   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
3043   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3044   8,                                    /* cost of moving SSE register to integer.  */
3045   8, 8,                                 /* Gather load static, per_elt.  */
3046   8, 8,                                 /* Gather store static, per_elt.  */
3047   32,                                   /* size of l1 cache.  */
3048   256,                                  /* size of l2 cache.  */
3049   64,                                   /* size of prefetch block */
3050   6,                                    /* number of parallel prefetches */
3051   3,                                    /* Branch cost */
3052   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3053   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3054   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3055   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3056   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3057   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3058
3059   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3060   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3061   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3062   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3063   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3064   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3065   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
3066   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
3067   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
3068   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
3069   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
       /* String-operation algorithm tables defined just above this struct.  */
3070   atom_memcpy,
3071   atom_memset,
3072   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3073   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3074   "16",                                 /* Loop alignment.  */
3075   "16:8:8",                             /* Jump alignment.  */
3076   "0:0:8",                              /* Label alignment.  */
3077   "16",                                 /* Func alignment.  */
3078   4,                                    /* Small unroll limit.  */
3079   2,                                    /* Small unroll factor.  */
3080 };
3081
3082 static stringop_algs slm_memcpy[2] = {
       /* Table referenced by slm_cost.  Both entries have the shape
          {alg, {{bound, alg, flag}, ...}}, with -1 as the last bound.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, the
          leading alg covers unknown sizes, and each triple selects ALG for
          blocks of at most BOUND bytes (-1 = no limit) -- confirm.  */
3083   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3084   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3085              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
3086 static stringop_algs slm_memset[2] = {
       /* Table referenced by slm_cost.  Both entries have the shape
          {alg, {{bound, alg, flag}, ...}}, with -1 as the last bound.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, the
          leading alg covers unknown sizes, and each triple selects ALG for
          blocks of at most BOUND bytes (-1 = no limit) -- confirm.  */
3087   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3088              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3089   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3090              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
3091 static const
3092 struct processor_costs slm_cost = {
       /* Positional initializer: every value must stay in the field order of
          struct processor_costs (declared outside this chunk).  The nested
          brace group that follows holds the register allocator costs.  */
3093   {
3094   /* Start of register allocator costs.  integer->integer move cost is 2. */
3095   8,                                    /* cost for loading QImode using movzbl */
3096   {8, 8, 8},                            /* cost of loading integer registers
3097                                            in QImode, HImode and SImode.
3098                                            Relative to reg-reg move (2).  */
3099   {6, 6, 6},                            /* cost of storing integer registers */
3100   2,                                    /* cost of reg,reg fld/fst */
3101   {8, 8, 18},                           /* cost of loading fp registers
3102                                            in SFmode, DFmode and XFmode */
3103   {6, 6, 18},                           /* cost of storing fp registers
3104                                            in SFmode, DFmode and XFmode */
3105   2,                                    /* cost of moving MMX register */
3106   {8, 8},                               /* cost of loading MMX registers
3107                                            in SImode and DImode */
3108   {6, 6},                               /* cost of storing MMX registers
3109                                            in SImode and DImode */
3110   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3111   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
3112                                            in 32,64,128,256 and 512-bit */
3113   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
3114                                            in 32,64,128,256 and 512-bit */
3115   8, 6,                         /* SSE->integer and integer->SSE moves */
3116   8, 6,                         /* mask->integer and integer->mask moves */
3117   {8, 8, 8},                    /* cost of loading mask register
3118                                            in QImode, HImode, SImode.  */
3119   {6, 6, 6},                    /* cost of storing mask register
3120                                            in QImode, HImode, SImode.  */
3121   2,                                    /* cost of moving mask register.  */
3122   /* End of register allocator costs.  */
3123   },
3124
       /* Instruction costs.  The file header describes processor costs as
          relative to an add.  */
3125   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3126   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction: one unit above an add */
3127   COSTS_N_INSNS (1),                    /* variable shift costs */
3128   COSTS_N_INSNS (1),                    /* constant shift costs */
3129   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3130    COSTS_N_INSNS (3),                   /*                               HI */
3131    COSTS_N_INSNS (3),                   /*                               SI */
3132    COSTS_N_INSNS (4),                   /*                               DI */
3133    COSTS_N_INSNS (2)},                  /*                            other */
3134   0,                                    /* cost of multiply per each bit set */
3135   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3136    COSTS_N_INSNS (26),                  /*                          HI */
3137    COSTS_N_INSNS (42),                  /*                          SI */
3138    COSTS_N_INSNS (74),                  /*                          DI */
3139    COSTS_N_INSNS (74)},                 /*                          other */
3140   COSTS_N_INSNS (1),                    /* cost of movsx */
3141   COSTS_N_INSNS (1),                    /* cost of movzx */
3142   8,                                    /* "large" insn */
3143   17,                                   /* MOVE_RATIO */
3144   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store, XMM/YMM/ZMM move and SSE->integer
          entries below carry the same values as the corresponding entries in
          the register allocator section above.  */
3145   {8, 8, 8},                            /* cost of loading integer registers
3146                                            in QImode, HImode and SImode.
3147                                            Relative to reg-reg move (2).  */
3148   {6, 6, 6},                            /* cost of storing integer registers */
3149   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
3150                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3151   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
3152                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3153   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
3154   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
3155   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3156   8,                                    /* cost of moving SSE register to integer.  */
3157   8, 8,                                 /* Gather load static, per_elt.  */
3158   8, 8,                                 /* Gather store static, per_elt.  */
3159   32,                                   /* size of l1 cache.  */
3160   256,                                  /* size of l2 cache.  */
3161   64,                                   /* size of prefetch block */
3162   6,                                    /* number of parallel prefetches */
3163   3,                                    /* Branch cost */
3164   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3165   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3166   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3167   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3168   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3169   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3170
3171   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3172   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3173   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3174   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3175   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3176   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3177   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
3178   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
3179   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
3180   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
3181   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
       /* String-operation algorithm tables defined just above this struct.  */
3182   slm_memcpy,
3183   slm_memset,
3184   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3185   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3186   "16",                                 /* Loop alignment.  */
3187   "16:8:8",                             /* Jump alignment.  */
3188   "0:0:8",                              /* Label alignment.  */
3189   "16",                                 /* Func alignment.  */
3190   4,                                    /* Small unroll limit.  */
3191   2,                                    /* Small unroll factor.  */
3192 };
3193
3194 static stringop_algs tremont_memcpy[2] = {
       /* Two identical entries of the shape {alg, {{bound, alg, flag}, ...}},
          each ending with a bound of -1.  The rep_prefix_1_byte row is the
          only one whose flag is true, and it shares the bound 256 with the
          loop row that follows it.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, each
          triple selects ALG for blocks of at most BOUND bytes (-1 = no
          limit), and the second 256 row acts as a fallback when the first
          algorithm cannot be used -- confirm.  */
3195   {libcall,
3196    {{256, rep_prefix_1_byte, true},
3197     {256, loop, false},
3198     {-1, libcall, false}}},
3199   {libcall,
3200    {{256, rep_prefix_1_byte, true},
3201     {256, loop, false},
3202     {-1, libcall, false}}}};
3203 static stringop_algs tremont_memset[2] = {
       /* Two identical entries of the shape {alg, {{bound, alg, flag}, ...}},
          each ending with a bound of -1.  The rep_prefix_1_byte row is the
          only one whose flag is true, and it shares the bound 256 with the
          loop row that follows it.
          NOTE(review): stringop_algs is declared outside this chunk;
          presumably [0] is the 32-bit table and [1] the 64-bit one, each
          triple selects ALG for blocks of at most BOUND bytes (-1 = no
          limit), and the second 256 row acts as a fallback when the first
          algorithm cannot be used -- confirm.  */
3204   {libcall,
3205    {{256, rep_prefix_1_byte, true},
3206     {256, loop, false},
3207     {-1, libcall, false}}},
3208   {libcall,
3209    {{256, rep_prefix_1_byte, true},
3210     {256, loop, false},
3211     {-1, libcall, false}}}};
3212 static const
3213 struct processor_costs tremont_cost = {
3214   {
3215   /* Start of register allocator costs.  integer->integer move cost is 2. */
3216   6,                                 /* cost for loading QImode using movzbl */
3217   {6, 6, 6},                            /* cost of loading integer registers
3218                                            in QImode, HImode and SImode.
3219                                            Relative to reg-reg move (2).  */
3220   {6, 6, 6},                            /* cost of storing integer registers */
3221   4,                                    /* cost of reg,reg fld/fst */
3222   {6, 6, 12},                           /* cost of loading fp registers
3223                                            in SFmode, DFmode and XFmode */
3224   {6, 6, 12},                           /* cost of storing fp registers
3225                                            in SFmode, DFmode and XFmode */
3226   2,                                    /* cost of moving MMX register */
3227   {6, 6},                               /* cost of loading MMX registers
3228                                            in SImode and DImode */
3229   {6, 6},                               /* cost of storing MMX registers
3230                                            in SImode and DImode */
3231   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3232   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
3233                                            in 32,64,128,256 and 512-bit */
3234   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
3235                                            in 32,64,128,256 and 512-bit */
3236   6, 6,                         /* SSE->integer and integer->SSE moves */
3237   6, 6,                         /* mask->integer and integer->mask moves */
3238   {6, 6, 6},                            /* cost of loading mask register
3239                                            in QImode, HImode, SImode.  */
3240   {6, 6, 6},                    /* cost if storing mask register
3241                                            in QImode, HImode, SImode.  */
3242   2,                                    /* cost of moving mask register.  */
3243   /* End of register allocator costs.  */
3244   },
3245
3246   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3247   /* Setting cost to 2 makes our current implementation of synth_mult result in
3248      use of unnecessary temporary registers causing regression on several
3249      SPECfp benchmarks.  */
3250   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3251   COSTS_N_INSNS (1),                    /* variable shift costs */
3252   COSTS_N_INSNS (1),                    /* constant shift costs */
3253   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3254    COSTS_N_INSNS (3),                   /*                               HI */
3255    COSTS_N_INSNS (3),                   /*                               SI */
3256    COSTS_N_INSNS (3),                   /*                               DI */
3257    COSTS_N_INSNS (4)},                  /*                            other */
3258   0,                                    /* cost of multiply per each bit set */
3259   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
3260    COSTS_N_INSNS (22),                  /*                          HI */
3261    COSTS_N_INSNS (30),                  /*                          SI */
3262    COSTS_N_INSNS (74),                  /*                          DI */
3263    COSTS_N_INSNS (74)},                 /*                          other */
3264   COSTS_N_INSNS (1),                    /* cost of movsx */
3265   COSTS_N_INSNS (1),                    /* cost of movzx */
3266   8,                                    /* "large" insn */
3267   17,                                   /* MOVE_RATIO */
3268   17,                                   /* CLEAR_RATIO */
3269   {6, 6, 6},                            /* cost of loading integer registers
3270                                            in QImode, HImode and SImode.
3271                                            Relative to reg-reg move (2).  */
3272   {6, 6, 6},                            /* cost of storing integer registers */
3273   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3274                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3275   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3276                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3277   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3278   {6, 6, 6, 10, 15},                    /* cost of unaligned storess.  */
3279   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3280   6,                                    /* cost of moving SSE register to integer.  */
3281   18, 6,                                /* Gather load static, per_elt.  */
3282   18, 6,                                /* Gather store static, per_elt.  */
3283   32,                                   /* size of l1 cache.  */
3284   512,                                  /* size of l2 cache.  */
3285   64,                                   /* size of prefetch block */
3286   6,                                    /* number of parallel prefetches */
3287   /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
3288      value is increased to perhaps more appropriate value of 5.  */
3289   3,                                    /* Branch cost */
3290   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3291   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
3292   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
3293   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3294   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3295   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
3296
3297   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3298   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3299   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3300   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3301   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3302   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3303   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3304   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3305   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
3306   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3307   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3308   tremont_memcpy,
3309   tremont_memset,
3310   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3311   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3312   "16:11:8",                            /* Loop alignment.  */
3313   "16:11:8",                            /* Jump alignment.  */
3314   "0:0:8",                              /* Label alignment.  */
3315   "16",                                 /* Func alignment.  */
3316   4,                                    /* Small unroll limit.  */
3317   2,                                    /* Small unroll factor.  */
3318 };
3319
3320 static stringop_algs intel_memcpy[2] = {  /* memcpy strategies referenced by intel_cost.  */
3321   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3322   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3323              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
3324 static stringop_algs intel_memset[2] = {  /* memset strategies referenced by intel_cost.  */
3325   {libcall, {{8, loop, false}, {15, unrolled_loop, false},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3326              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3327   {libcall, {{24, loop, false}, {32, unrolled_loop, false},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3328              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
3329 static const
3330 struct processor_costs intel_cost = {  /* cost table using intel_memcpy/intel_memset; presumably generic Intel tuning, judging by the name.  */
3331   {
3332   /* Start of register allocator costs.  integer->integer move cost is 2. */
3333   6,                                 /* cost for loading QImode using movzbl */
3334   {4, 4, 4},                            /* cost of loading integer registers
3335                                            in QImode, HImode and SImode.
3336                                            Relative to reg-reg move (2).  */
3337   {6, 6, 6},                            /* cost of storing integer registers */
3338   2,                                    /* cost of reg,reg fld/fst */
3339   {6, 6, 8},                            /* cost of loading fp registers
3340                                            in SFmode, DFmode and XFmode */
3341   {6, 6, 10},                           /* cost of storing fp registers
3342                                            in SFmode, DFmode and XFmode */
3343   2,                                    /* cost of moving MMX register */
3344   {6, 6},                               /* cost of loading MMX registers
3345                                            in SImode and DImode */
3346   {6, 6},                               /* cost of storing MMX registers
3347                                            in SImode and DImode */
3348   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
3349   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
3350                                            in 32,64,128,256 and 512-bit */
3351   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
3352                                            in 32,64,128,256 and 512-bit */
3353   4, 4,                         /* SSE->integer and integer->SSE moves */
3354   4, 4,                         /* mask->integer and integer->mask moves */
3355   {4, 4, 4},                            /* cost of loading mask register
3356                                            in QImode, HImode, SImode.  */
3357   {6, 6, 6},                            /* cost of storing mask register
3358                                            in QImode, HImode, SImode.  */
3359   2,                                    /* cost of moving mask register.  */
3360   /* End of register allocator costs.  */
3361   },
3362
3363   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3364   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3365   COSTS_N_INSNS (1),                    /* variable shift costs */
3366   COSTS_N_INSNS (1),                    /* constant shift costs */
3367   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3368    COSTS_N_INSNS (3),                   /*                               HI */
3369    COSTS_N_INSNS (3),                   /*                               SI */
3370    COSTS_N_INSNS (4),                   /*                               DI */
3371    COSTS_N_INSNS (2)},                  /*                            other */
3372   0,                                    /* cost of multiply per each bit set */
3373   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3374    COSTS_N_INSNS (26),                  /*                          HI */
3375    COSTS_N_INSNS (42),                  /*                          SI */
3376    COSTS_N_INSNS (74),                  /*                          DI */
3377    COSTS_N_INSNS (74)},                 /*                          other */
3378   COSTS_N_INSNS (1),                    /* cost of movsx */
3379   COSTS_N_INSNS (1),                    /* cost of movzx */
3380   8,                                    /* "large" insn */
3381   17,                                   /* MOVE_RATIO */
3382   6,                                    /* CLEAR_RATIO */
3383   {4, 4, 4},                            /* cost of loading integer registers
3384                                            in QImode, HImode and SImode.
3385                                            Relative to reg-reg move (2).  */
3386   {6, 6, 6},                            /* cost of storing integer registers */
3387   {6, 6, 6, 6, 6},                      /* cost of loading SSE register
3388                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3389   {6, 6, 6, 6, 6},                      /* cost of storing SSE register
3390                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3391   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
3392   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
3393   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
3394   4,                                    /* cost of moving SSE register to integer.  */
3395   6, 6,                                 /* Gather load static, per_elt.  */
3396   6, 6,                                 /* Gather store static, per_elt.  */
3397   32,                                   /* size of l1 cache.  */
3398   256,                                  /* size of l2 cache.  */
3399   64,                                   /* size of prefetch block */
3400   6,                                    /* number of parallel prefetches */
3401   3,                                    /* Branch cost */
3402   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3403   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3404   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3405   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3406   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3407   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3408
3409   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3410   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3411   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
3412   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
3413   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3414   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3415   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
3416   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
3417   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
3418   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
3419   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
3420   intel_memcpy,
3421   intel_memset,
3422   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3423   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3424   "16",                                 /* Loop alignment.  */
3425   "16:8:8",                             /* Jump alignment.  */
3426   "0:0:8",                              /* Label alignment.  */
3427   "16",                                 /* Func alignment.  */
3428   4,                                    /* Small unroll limit.  */
3429   2,                                    /* Small unroll factor.  */
3430 };
3431
3432 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
3433 static stringop_algs lujiazui_memcpy[2] = {  /* memcpy strategies referenced by lujiazui_cost.  */
3434   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3435                          {-1, libcall, false}}},
3436   {libcall, {{12, unrolled_loop, true}, {32, loop, false},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3437                          {6144, rep_prefix_8_byte, false},
3438                          {-1, libcall, false}}}};
3439 static stringop_algs lujiazui_memset[2] = {  /* memset strategies referenced by lujiazui_cost.  */
3440   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3441                          {-1, libcall, false}}},
3442   {libcall, {{12, loop, true}, {32, loop, false},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3443                          {640, rep_prefix_8_byte, false},
3444                          {-1, libcall, false}}}};
3445 static const
3446 struct processor_costs lujiazui_cost = {  /* costs for tuning for the ZHAOXIN lujiazui CPU; uses lujiazui_memcpy/lujiazui_memset.  */
3447   {
3448   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3449   6,                            /* cost for loading QImode using movzbl.  */
3450   {6, 6, 6},                    /* cost of loading integer registers
3451                                            in QImode, HImode and SImode.
3452                                            Relative to reg-reg move (2).  */
3453   {6, 6, 6},                    /* cost of storing integer registers.  */
3454   2,                                    /* cost of reg,reg fld/fst.  */
3455   {6, 6, 8},                    /* cost of loading fp registers
3456                                 in SFmode, DFmode and XFmode.  */
3457   {6, 6, 8},                    /* cost of storing fp registers
3458                                 in SFmode, DFmode and XFmode.  */
3459   2,                            /* cost of moving MMX register.  */
3460   {6, 6},                       /* cost of loading MMX registers
3461                                 in SImode and DImode.  */
3462   {6, 6},                       /* cost of storing MMX registers
3463                                 in SImode and DImode.  */
3464   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3465   {6, 6, 6, 10, 15},    /* cost of loading SSE registers
3466                                 in 32,64,128,256 and 512-bit.  */
3467   {6, 6, 6, 10, 15},    /* cost of storing SSE registers
3468                                 in 32,64,128,256 and 512-bit.  */
3469   6, 6,                         /* SSE->integer and integer->SSE moves.  */
3470   6, 6,                         /* mask->integer and integer->mask moves.  */
3471   {6, 6, 6},            /* cost of loading mask register
3472                                 in QImode, HImode, SImode.  */
3473   {6, 6, 6},            /* cost of storing mask register
3474                                 in QImode, HImode, SImode.  */
3475   2,                            /* cost of moving mask register.  */
3476   /* End of register allocator costs.  */
3477   },
3478
3479   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3480   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction.  */
3481   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3482   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3483   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3484    COSTS_N_INSNS (3),                   /*                               HI.  */
3485    COSTS_N_INSNS (3),                   /*                               SI.  */
3486    COSTS_N_INSNS (12),                  /*                               DI.  */
3487    COSTS_N_INSNS (14)},         /*                               other.  */
3488   0,                            /* cost of multiply per each bit set.  */
3489   {COSTS_N_INSNS (22),                  /* cost of a divide/mod for QI.  */
3490    COSTS_N_INSNS (24),                  /*                          HI.  */
3491    COSTS_N_INSNS (24),                  /*                          SI.  */
3492    COSTS_N_INSNS (150),                 /*                          DI.  */
3493    COSTS_N_INSNS (152)},                /*                          other.  */
3494   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3495   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3496   8,                                    /* "large" insn.  */
3497   17,                                   /* MOVE_RATIO.  */
3498   6,                                    /* CLEAR_RATIO.  */
3499   {6, 6, 6},                            /* cost of loading integer registers
3500                                            in QImode, HImode and SImode.
3501                                            Relative to reg-reg move (2).  */
3502   {6, 6, 6},                    /* cost of storing integer registers.  */
3503   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3504                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3505   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3506                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3507   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3508   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
3509   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3510   6,                            /* cost of moving SSE register to integer.  */
3511   18, 6,                                /* Gather load static, per_elt.  */
3512   18, 6,                                /* Gather store static, per_elt.  */
3513   32,                                   /* size of l1 cache.  */
3514   4096,                                 /* size of l2 cache.  */
3515   64,                                   /* size of prefetch block.  */
3516   /* Lujiazui processors never drop prefetches, like AMD processors.  */
3517   100,                                  /* number of parallel prefetches.  */
3518   3,                                    /* Branch cost.  */
3519   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3520   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
3521   COSTS_N_INSNS (22),                   /* cost of FDIV instruction.  */
3522   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3523   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3524   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
3525
3526   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3527   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3528   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3529   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
3530   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3531   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3532   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3533   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3534   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
3535   COSTS_N_INSNS (60),                   /* cost of SQRTSD instruction.  */
3536   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3537   lujiazui_memcpy,
3538   lujiazui_memset,
3539   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3540   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3541   "16:11:8",                            /* Loop alignment.  */
3542   "16:11:8",                            /* Jump alignment.  */
3543   "0:0:8",                              /* Label alignment.  */
3544   "16",                                 /* Func alignment.  */
3545   4,                                    /* Small unroll limit.  */
3546   2,                                    /* Small unroll factor.  */
3547 };
3548
3549 /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU.  */
3550 static stringop_algs yongfeng_memcpy[2] = {  /* memcpy strategies referenced by yongfeng_cost.  */
3551   {libcall, {{6, unrolled_loop, true}, {256, unrolled_loop, false},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3552                          {-1, libcall, false}}},
3553   {libcall, {{8, loop, false}, {512, unrolled_loop, false},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3554                          {-1, libcall, false}}}};
3555 static stringop_algs yongfeng_memset[2] = {  /* memset strategies referenced by yongfeng_cost.  */
3556   {libcall, {{6, loop_1_byte, false}, {128, loop, false},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3557                          {-1, libcall, false}}},
3558   {libcall, {{2, rep_prefix_4_byte, false}, {64, loop, false},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3559                          {1024, vector_loop, false},
3560                          {-1, libcall, false}}}};
3561 static const
3562 struct processor_costs yongfeng_cost = {  /* costs for tuning for the ZHAOXIN yongfeng CPU; uses yongfeng_memcpy/yongfeng_memset.  */
3563   {
3564   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3565   8,                            /* cost for loading QImode using movzbl.  */
3566   {8, 8, 8},                    /* cost of loading integer registers
3567                                            in QImode, HImode and SImode.
3568                                            Relative to reg-reg move (2).  */
3569   {8, 8, 8},                    /* cost of storing integer registers.  */
3570   2,                                    /* cost of reg,reg fld/fst.  */
3571   {8, 8, 8},                    /* cost of loading fp registers
3572                                 in SFmode, DFmode and XFmode.  */
3573   {8, 8, 8},                    /* cost of storing fp registers
3574                                 in SFmode, DFmode and XFmode.  */
3575   2,                            /* cost of moving MMX register.  */
3576   {8, 8},                       /* cost of loading MMX registers
3577                                 in SImode and DImode.  */
3578   {8, 8},                       /* cost of storing MMX registers
3579                                 in SImode and DImode.  */
3580   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3581   {8, 8, 8, 10, 15},    /* cost of loading SSE registers
3582                                 in 32,64,128,256 and 512-bit.  */
3583   {8, 8, 8, 10, 15},    /* cost of storing SSE registers
3584                                 in 32,64,128,256 and 512-bit.  */
3585   8, 8,                         /* SSE->integer and integer->SSE moves.  */
3586   8, 8,                         /* mask->integer and integer->mask moves.  */
3587   {8, 8, 8},            /* cost of loading mask register
3588                                 in QImode, HImode, SImode.  */
3589   {8, 8, 8},            /* cost of storing mask register
3590                                 in QImode, HImode, SImode.  */
3591   2,                            /* cost of moving mask register.  */
3592   /* End of register allocator costs.  */
3593   },
3594
3595   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3596   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
3597   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3598   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3599   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3600    COSTS_N_INSNS (3),                   /*                               HI.  */
3601    COSTS_N_INSNS (2),                   /*                               SI.  */
3602    COSTS_N_INSNS (2),                   /*                               DI.  */
3603    COSTS_N_INSNS (3)},          /*                               other.  */
3604   0,                            /* cost of multiply per each bit set.  */
3605   {COSTS_N_INSNS (8),                   /* cost of a divide/mod for QI.  */
3606    COSTS_N_INSNS (9),                   /*                          HI.  */
3607    COSTS_N_INSNS (8),                   /*                          SI.  */
3608    COSTS_N_INSNS (41),                  /*                          DI.  */
3609    COSTS_N_INSNS (41)},         /*                          other.  */
3610   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3611   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3612   8,                                    /* "large" insn.  */
3613   17,                                   /* MOVE_RATIO.  */
3614   6,                                    /* CLEAR_RATIO.  */
3615   {8, 8, 8},                            /* cost of loading integer registers
3616                                            in QImode, HImode and SImode.
3617                                            Relative to reg-reg move (2).  */
3618   {8, 8, 8},                    /* cost of storing integer registers.  */
3619   {8, 8, 8, 12, 15},                    /* cost of loading SSE register
3620                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3621   {8, 8, 8, 12, 15},                    /* cost of storing SSE register
3622                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3623   {8, 8, 8, 12, 15},                    /* cost of unaligned loads.  */
3624   {8, 8, 8, 12, 15},                    /* cost of unaligned stores.  */
3625   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3626   8,                            /* cost of moving SSE register to integer.  */
3627   18, 6,                                /* Gather load static, per_elt.  */
3628   18, 6,                                /* Gather store static, per_elt.  */
3629   32,                                   /* size of l1 cache.  */
3630   256,                                  /* size of l2 cache.  */
3631   64,                                   /* size of prefetch block.  */
3632   12,                                   /* number of parallel prefetches.  */
3633   3,                                    /* Branch cost.  */
3634   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3635   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
3636   COSTS_N_INSNS (14),                   /* cost of FDIV instruction.  */
3637   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3638   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3639   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3640
3641   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3642   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3643   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3644   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
3645   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3646   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3647   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
3648   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
3649   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
3650   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
3651   4, 4, 4, 4,                           /* reassoc int, fp, vec_int, vec_fp.  */
3652   yongfeng_memcpy,
3653   yongfeng_memset,
3654   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3655   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3656   "16:11:8",                            /* Loop alignment.  */
3657   "16:11:8",                            /* Jump alignment.  */
3658   "0:0:8",                              /* Label alignment.  */
3659   "16",                                 /* Func alignment.  */
3660   4,                                    /* Small unroll limit.  */
3661   2,                                    /* Small unroll factor.  */
3662 };
3663
3664 /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU.  */
3665 static stringop_algs shijidadao_memcpy[2] = {  /* memcpy strategies for the ZHAOXIN shijidadao tuning.  */
3666   {libcall, {{8, unrolled_loop, true}, {256, unrolled_loop, false},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3667                          {-1, libcall, false}}},
3668   {libcall, {{10, loop, true}, {256, unrolled_loop, false},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3669                          {-1, libcall, false}}}};
3670 static stringop_algs shijidadao_memset[2] = {  /* memset strategies for the ZHAOXIN shijidadao tuning.  */
3671   {libcall, {{4, loop, true}, {128, unrolled_loop, false},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
3672                          {-1, libcall, false}}},
3673   {libcall, {{1, rep_prefix_4_byte, false}, {14, loop, true},  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
3674                          {1024, vector_loop, false},
3675                          {-1, libcall, false}}}};
3676 static const
3677 struct processor_costs shijidadao_cost = {
3678   {
3679   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3680   8,                            /* cost for loading QImode using movzbl.  */
3681   {8, 8, 8},                    /* cost of loading integer registers
3682                                            in QImode, HImode and SImode.
3683                                            Relative to reg-reg move (2).  */
3684   {8, 8, 8},                    /* cost of storing integer registers.  */
3685   2,                                    /* cost of reg,reg fld/fst.  */
3686   {8, 8, 8},                    /* cost of loading fp registers
3687                                 in SFmode, DFmode and XFmode.  */
3688   {8, 8, 8},                    /* cost of storing fp registers
3689                                 in SFmode, DFmode and XFmode.  */
3690   2,                            /* cost of moving MMX register.  */
3691   {8, 8},                       /* cost of loading MMX registers
3692                                 in SImode and DImode.  */
3693   {8, 8},                       /* cost of storing MMX registers
3694                                 in SImode and DImode.  */
3695   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3696   {8, 8, 8, 10, 15},    /* cost of loading SSE registers
3697                                 in 32,64,128,256 and 512-bit.  */
3698   {8, 8, 8, 10, 15},    /* cost of storing SSE registers
3699                                 in 32,64,128,256 and 512-bit.  */
3700   8, 8,                         /* SSE->integer and integer->SSE moves.  */
3701   8, 8,                         /* mask->integer and integer->mask moves.  */
3702   {8, 8, 8},            /* cost of loading mask register
3703                                 in QImode, HImode, SImode.  */
3704   {8, 8, 8},            /* cost if storing mask register
3705                                 in QImode, HImode, SImode.  */
3706   2,                            /* cost of moving mask register.  */
3707   /* End of register allocator costs.  */
3708   },
3709
3710   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3711   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
3712   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3713   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3714   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3715    COSTS_N_INSNS (3),                   /*                               HI.  */
3716    COSTS_N_INSNS (2),                   /*                               SI.  */
3717    COSTS_N_INSNS (2),                   /*                               DI.  */
3718    COSTS_N_INSNS (3)},          /*                               other.  */
3719   0,                            /* cost of multiply per each bit set.  */
3720   {COSTS_N_INSNS (9),                   /* cost of a divide/mod for QI.  */
3721    COSTS_N_INSNS (10),                  /*                          HI.  */
3722    COSTS_N_INSNS (9),                   /*                          SI.  */
3723    COSTS_N_INSNS (50),                  /*                          DI.  */
3724    COSTS_N_INSNS (50)},         /*                          other.  */
3725   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3726   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3727   8,                                    /* "large" insn.  */
3728   17,                                   /* MOVE_RATIO.  */
3729   6,                                    /* CLEAR_RATIO.  */
3730   {8, 8, 8},                            /* cost of loading integer registers
3731                                            in QImode, HImode and SImode.
3732                                            Relative to reg-reg move (2).  */
3733   {8, 8, 8},                    /* cost of storing integer registers.  */
3734   {8, 8, 8, 12, 15},                    /* cost of loading SSE register
3735                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3736   {8, 8, 8, 12, 15},                    /* cost of storing SSE register
3737                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3738   {8, 8, 8, 12, 15},                    /* cost of unaligned loads.  */
3739   {8, 8, 8, 12, 15},                    /* cost of unaligned storess.  */
3740   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3741   8,                            /* cost of moving SSE register to integer.  */
3742   18, 6,                                /* Gather load static, per_elt.  */
3743   18, 6,                                /* Gather store static, per_elt.  */
3744   32,                                   /* size of l1 cache.  */
3745   256,                                  /* size of l2 cache.  */
3746   64,                                   /* size of prefetch block.  */
3747   12,                                   /* number of parallel prefetches.  */
3748   3,                                    /* Branch cost.  */
3749   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3750   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
3751   COSTS_N_INSNS (13),                   /* cost of FDIV instruction.  */
3752   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
3753   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
3754   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
3755
3756   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3757   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3758   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3759   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
3760   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3761   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3762   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
3763   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
3764   COSTS_N_INSNS (11),                   /* cost of SQRTSS instruction.  */
3765   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3766   4, 4, 4, 4,                           /* reassoc int, fp, vec_int, vec_fp.  */
3767   shijidadao_memcpy,
3768   shijidadao_memset,
3769   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3770   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3771   "16:11:8",                            /* Loop alignment.  */
3772   "16:11:8",                            /* Jump alignment.  */
3773   "0:0:8",                              /* Label alignment.  */
3774   "16",                         /* Func alignment.  */
3775   4,                                    /* Small unroll limit.  */
3776   2,                                    /* Small unroll factor.  */
3777 };
3778
3779
3780
3781 /* Generic should produce code tuned for Core-i7 (and newer chips)
3782    and btver1 (and newer chips).  */
3783
3784 static stringop_algs generic_memcpy[2] = {    /* memcpy strategy table referenced by generic_cost.  */
3785   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3786              {-1, libcall, false}}},    /* NOTE(review): presumably the 32-bit variant; each triple looks like {max size, algorithm, noalign} with -1 as the catch-all -- confirm against the stringop_algs declaration.  */
3787   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3788              {-1, libcall, false}}}};    /* NOTE(review): presumably the 64-bit variant (it uses rep_prefix_8_byte) -- confirm.  */
3789 static stringop_algs generic_memset[2] = {    /* memset strategy table referenced by generic_cost.  */
3790   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3791              {-1, libcall, false}}},    /* NOTE(review): presumably the 32-bit variant; each triple looks like {max size, algorithm, noalign} with -1 as the catch-all -- confirm against the stringop_algs declaration.  */
3792   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3793              {-1, libcall, false}}}};    /* NOTE(review): presumably the 64-bit variant (it uses rep_prefix_8_byte) -- confirm.  */
3794 static const
3795 struct processor_costs generic_cost = {    /* Cost table for the "generic" tuning target.  The initializer is positional, so entries must stay in the order of the fields of struct processor_costs.  */
3796   {
3797   /* Start of register allocator costs.  integer->integer move cost is 2. */
3798   6,                                 /* cost for loading QImode using movzbl */
3799   {6, 6, 6},                            /* cost of loading integer registers
3800                                            in QImode, HImode and SImode.
3801                                            Relative to reg-reg move (2).  */
3802   {6, 6, 6},                            /* cost of storing integer registers */
3803   4,                                    /* cost of reg,reg fld/fst */
3804   {6, 6, 12},                           /* cost of loading fp registers
3805                                            in SFmode, DFmode and XFmode */
3806   {6, 6, 12},                           /* cost of storing fp registers
3807                                            in SFmode, DFmode and XFmode */
3808   2,                                    /* cost of moving MMX register */
3809   {6, 6},                               /* cost of loading MMX registers
3810                                            in SImode and DImode */
3811   {6, 6},                               /* cost of storing MMX registers
3812                                            in SImode and DImode */
3813   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3814   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
3815                                            in 32,64,128,256 and 512-bit */
3816   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
3817                                            in 32,64,128,256 and 512-bit */
3818   6, 6,                         /* SSE->integer and integer->SSE moves */
3819   6, 6,                         /* mask->integer and integer->mask moves */
3820   {6, 6, 6},                            /* cost of loading mask register
3821                                            in QImode, HImode, SImode.  */
3822   {6, 6, 6},                    /* cost of storing mask register
3823                                            in QImode, HImode, SImode.  */
3824   2,                                    /* cost of moving mask register.  */
3825   /* End of register allocator costs.  */
3826   },
3827
3828   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3829   /* Setting cost to 2 makes our current implementation of synth_mult result in
3830      use of unnecessary temporary registers causing regression on several
3831      SPECfp benchmarks.  */
3832   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3833   COSTS_N_INSNS (1),                    /* variable shift costs */
3834   COSTS_N_INSNS (1),                    /* constant shift costs */
3835   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3836    COSTS_N_INSNS (3),                   /*                               HI */
3837    COSTS_N_INSNS (3),                   /*                               SI */
3838    COSTS_N_INSNS (3),                   /*                               DI */
3839    COSTS_N_INSNS (4)},                  /*                            other */
3840   0,                                    /* cost of multiply per each bit set */
3841   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
3842    COSTS_N_INSNS (22),                  /*                          HI */
3843    COSTS_N_INSNS (30),                  /*                          SI */
3844    COSTS_N_INSNS (74),                  /*                          DI */
3845    COSTS_N_INSNS (74)},                 /*                          other */
3846   COSTS_N_INSNS (1),                    /* cost of movsx */
3847   COSTS_N_INSNS (1),                    /* cost of movzx */
3848   8,                                    /* "large" insn */
3849   17,                                   /* MOVE_RATIO */
3850   6,                                    /* CLEAR_RATIO */
3851   {6, 6, 6},                            /* cost of loading integer registers
3852                                            in QImode, HImode and SImode.
3853                                            Relative to reg-reg move (2).  */
3854   {6, 6, 6},                            /* cost of storing integer registers */
3855   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3856                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3857   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3858                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3859   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3860   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
3861   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3862   6,                                    /* cost of moving SSE register to integer.  */
3863   18, 6,                                /* Gather load static, per_elt.  */
3864   18, 6,                                /* Gather store static, per_elt.  */
3865   32,                                   /* size of l1 cache.  */
3866   512,                                  /* size of l2 cache.  */
3867   64,                                   /* size of prefetch block */
3868   6,                                    /* number of parallel prefetches */
3869   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3870      value is increased to the perhaps more appropriate value of 5.  */
3871   3,                                    /* Branch cost */
3872   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3873   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
3874   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
3875   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3876   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3877   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
3878
3879   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3880   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3881   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3882   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3883   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3884   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3885   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3886   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3887   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
3888   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3889   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3890   generic_memcpy,                       /* memcpy stringop_algs table defined above.  */
3891   generic_memset,                       /* memset stringop_algs table defined above.  */
3892   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3893   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3894   "16",                                 /* Loop alignment.  */
3895   "16:11:8",                            /* Jump alignment.  */
3896   "0:0:8",                              /* Label alignment.  */
3897   "16",                                 /* Func alignment.  */
3898   4,                                    /* Small unroll limit.  */
3899   2,                                    /* Small unroll factor.  */
3900 };
3901
3902 /* core_cost should produce code tuned for the Core family of CPUs.  */
3903 static stringop_algs core_memcpy[2] = {    /* memcpy strategy table referenced by core_cost.  */
3904   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},    /* NOTE(review): presumably the 32-bit variant; each triple looks like {max size, algorithm, noalign} with -1 as the catch-all -- confirm against the stringop_algs declaration.  */
3905   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
3906              {-1, libcall, false}}}};    /* NOTE(review): presumably the 64-bit variant (it uses rep_prefix_8_byte) -- confirm.  */
3907 static stringop_algs core_memset[2] = {    /* memset strategy table referenced by core_cost.  */
3908   {libcall, {{6, loop_1_byte, true},    /* NOTE(review): presumably the 32-bit variant; each triple looks like {max size, algorithm, noalign} -- confirm against the stringop_algs declaration.  */
3909              {24, loop, true},
3910              {8192, rep_prefix_4_byte, true},
3911              {-1, libcall, false}}},    /* -1 presumably marks the catch-all entry for larger sizes -- confirm.  */
3912   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
3913              {-1, libcall, false}}}};    /* NOTE(review): presumably the 64-bit variant (it uses rep_prefix_8_byte) -- confirm.  */
3914
3915 static const
3916 struct processor_costs core_cost = {    /* Cost table for the Core family tuning target.  The initializer is positional, so entries must stay in the order of the fields of struct processor_costs.  */
3917   {
3918   /* Start of register allocator costs.  integer->integer move cost is 2. */
3919   6,                                 /* cost for loading QImode using movzbl */
3920   {4, 4, 4},                            /* cost of loading integer registers
3921                                            in QImode, HImode and SImode.
3922                                            Relative to reg-reg move (2).  */
3923   {6, 6, 6},                            /* cost of storing integer registers */
3924   2,                                    /* cost of reg,reg fld/fst */
3925   {6, 6, 8},                            /* cost of loading fp registers
3926                                            in SFmode, DFmode and XFmode */
3927   {6, 6, 10},                           /* cost of storing fp registers
3928                                            in SFmode, DFmode and XFmode */
3929   2,                                    /* cost of moving MMX register */
3930   {6, 6},                               /* cost of loading MMX registers
3931                                            in SImode and DImode */
3932   {6, 6},                               /* cost of storing MMX registers
3933                                            in SImode and DImode */
3934   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
3935   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
3936                                            in 32,64,128,256 and 512-bit */
3937   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
3938                                            in 32,64,128,256 and 512-bit */
3939   6, 6,                         /* SSE->integer and integer->SSE moves */
3940   6, 6,                         /* mask->integer and integer->mask moves */
3941   {4, 4, 4},                            /* cost of loading mask register
3942                                            in QImode, HImode, SImode.  */
3943   {6, 6, 6},                            /* cost of storing mask register
3944                                            in QImode, HImode, SImode.  */
3945   2,                                    /* cost of moving mask register.  */
3946   /* End of register allocator costs.  */
3947   },
3948
3949   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3950   /* On all chips taken into consideration lea is 2 cycles and more.  With
3951      this cost however our current implementation of synth_mult results in
3952      use of unnecessary temporary registers causing regression on several
3953      SPECfp benchmarks.  */
3954   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3955   COSTS_N_INSNS (1),                    /* variable shift costs */
3956   COSTS_N_INSNS (1),                    /* constant shift costs */
3957   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3958    COSTS_N_INSNS (4),                   /*                               HI */
3959    COSTS_N_INSNS (3),                   /*                               SI */
3960    /* Here we tune for Sandybridge or newer.  */
3961    COSTS_N_INSNS (3),                   /*                               DI */
3962    COSTS_N_INSNS (3)},                  /*                            other */
3963   0,                                    /* cost of multiply per each bit set */
3964   /* Expanding div/mod currently doesn't consider parallelism. So the cost
3965      model is not realistic. We compensate by increasing the latencies a bit.  */
3966   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
3967    COSTS_N_INSNS (11),                  /*                          HI */
3968    COSTS_N_INSNS (14),                  /*                          SI */
3969    COSTS_N_INSNS (81),                  /*                          DI */
3970    COSTS_N_INSNS (81)},                 /*                          other */
3971   COSTS_N_INSNS (1),                    /* cost of movsx */
3972   COSTS_N_INSNS (1),                    /* cost of movzx */
3973   8,                                    /* "large" insn */
3974   17,                                   /* MOVE_RATIO */
3975   6,                                    /* CLEAR_RATIO */
3976   {4, 4, 4},                            /* cost of loading integer registers
3977                                            in QImode, HImode and SImode.
3978                                            Relative to reg-reg move (2).  */
3979   {6, 6, 6},                            /* cost of storing integer registers */
3980   {6, 6, 6, 6, 12},                     /* cost of loading SSE register
3981                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3982   {6, 6, 6, 6, 12},                     /* cost of storing SSE register
3983                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3984   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
3985   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
3986   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
3987   2,                                    /* cost of moving SSE register to integer.  */
3988   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3989      rec. throughput 6.
3990      So 5 uops statically and one uop per load.  NOTE(review): VGATHERDPD is named twice above; the second mention is presumably a different gather variant -- confirm.  */
3991   10, 6,                                /* Gather load static, per_elt.  */
3992   10, 6,                                /* Gather store static, per_elt.  */
3993   64,                                   /* size of l1 cache.  */
3994   512,                                  /* size of l2 cache.  */
3995   64,                                   /* size of prefetch block */
3996   6,                                    /* number of parallel prefetches */
3997   /* FIXME perhaps more appropriate value is 5.  */
3998   3,                                    /* Branch cost */
3999   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
4000   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
4001   /* 10-24.  NOTE(review): presumably the latency range of FDIV, with the upper bound used below -- confirm.  */
4002   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
4003   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
4004   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
4005   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
4006
4007   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
4008   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
4009   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
4010   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
4011   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
4012   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
4013   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
4014   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
4015   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
4016   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
4017   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
4018   core_memcpy,                          /* memcpy stringop_algs table defined above.  */
4019   core_memset,                          /* memset stringop_algs table defined above.  */
4020   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
4021   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
4022   "16:11:8",                            /* Loop alignment.  */
4023   "16:11:8",                            /* Jump alignment.  */
4024   "0:0:8",                              /* Label alignment.  */
4025   "16",                                 /* Func alignment.  */
4026   4,                                    /* Small unroll limit.  */
4027   2,                                    /* Small unroll factor.  */
4028 };
4029