gcc/config/sparc/m8.md

   1 ;; Scheduling description for the SPARC M8.
   2 ;;   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   3 ;;
   4 ;; This file is part of GCC.
   5 ;;
   6 ;; GCC is free software; you can redistribute it and/or modify
   7 ;; it under the terms of the GNU General Public License as published by
   8 ;; the Free Software Foundation; either version 3, or (at your option)
   9 ;; any later version.
  10 ;;
  11 ;; GCC is distributed in the hope that it will be useful,
  12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 ;; GNU General Public License for more details.
  15 ;;
  16 ;; You should have received a copy of the GNU General Public License
  17 ;; along with GCC; see the file COPYING3.  If not see
  18 ;; <http://www.gnu.org/licenses/>.
  19
  20 ;; Things to improve:
  21 ;;
  22 ;; - Store instructions are implemented by micro-ops, one of which
  23 ;;   generates the store address and is executed in the store address
  24 ;;   generation unit in the slot0.  We need to model that.
  25 ;;
  26 ;; - There are two V3 pipes connected to different slots.  The current
  27 ;;   implementation assumes that all the instructions executing in a
  28 ;;   V3 pipe are issued to the unit in slot3.
  29 ;;
  30 ;; - Single-issue ALU operations incur an additional cycle of latency to
  31 ;;   slot 0 and slot 1 instructions.  This is not currently reflected
  32 ;;   in the DFA.
  33
  34 (define_automaton "m8_0")
  35
  36 ;; The S5 core has two dual-issue queues, PQLS and PQEX.  Each queue
  37 ;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
  38 ;; PQEX corresponds to slots 2 and 3.  The core can issue 4
  39 ;; instructions per cycle, and up to 4 instructions are committed each
  40 ;; cycle.
  41 ;;
  42 ;; Each slot is declared below as one CPU unit of the m8_0 automaton:
  43 ;;                   m8_slot0  - Load Unit.
  44 ;;                             - Store address gen. Unit.
  45 ;;
  46 ;;
  47 ;;   === PQLS ==>    m8_slot1  - Store data unit.
  48 ;;                             - Branch unit.
  49 ;;
  50 ;;
  51 ;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).
  52 ;;                             - 3-cycles Crypto Unit (SPU2).
  53 ;;
  54 ;;                   m8_slot3  - Integer Unit (EXU3).
  55 ;;                             - 3-cycles Crypto Unit (SPU3).
  56 ;;                             - Floating-point and graphics unit (FPG).
  57 ;;                             - Long-latency Crypto Unit.
  58 ;;                             - Oracle Numbers Unit (ONU).
  59
  60 (define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")
  61
  62 ;; Some instructions stall the pipeline and prevent any other
  63 ;; instruction from issuing in the same cycle; we model that by reserving
  64 ;; all four slots.  We assume the same for multi-instruction insns.
  65
  66 (define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")
  67
  68 (define_insn_reservation "m8_single" 1
  69   (and (eq_attr "cpu" "m8")
  70        (eq_attr "type" "multi,savew,flushw,trap,bmask"))
  71   "m8_single_issue")
  72
  73 ;; Most integer-unit instructions have a latency of 1.  `bmask' is not
  74 ;; listed here because m8_single above already lists it.
  75
  76 (define_insn_reservation "m8_integer" 1
  77   (and (eq_attr "cpu" "m8")
  78        (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  79   "(m8_slot2 | m8_slot3)")
  80
  81 ;; Flushing the instruction memory takes 27 cycles, issued in slot 2 or 3.
  82
  83
  84 (define_insn_reservation "m8_iflush" 27
  85   (and (eq_attr "cpu" "m8")
  86        (eq_attr "type" "iflush"))
  87   "(m8_slot2 | m8_slot3), nothing*26")
  88
  89 ;; The integer multiplication instructions have a latency of 10 cycles
  90 ;; and execute in integer units.  Likewise for array*, edge* and pdistn
  91 ;; instructions.  The reservation is one issue cycle in slot 2 or 3
  92 ;; followed by latency - 1 cycles in which no unit is held.
  93 ;;
  94 ;; However, the latency is only 9 cycles if the consumer of the
  95 ;; operation is also capable of 9 cycles latency.  We model this with
  96 ;; a bypass.
  97
  98 (define_insn_reservation "m8_imul" 10
  99   (and (eq_attr "cpu" "m8")
 100        (eq_attr "type" "imul,array,edge,edgen,pdistn"))
 101   "(m8_slot2 | m8_slot3), nothing*9")
 102
 103 (define_bypass 9 "m8_imul" "m8_imul")
 104
 105 ;; Integer division (type idiv, e.g. `sdiv' and `udivx') has a latency
 106 ;; of 30 cycles and executes in the integer units of slots 2 and 3.
 107
 108 (define_insn_reservation "m8_idiv" 30
 109   (and (eq_attr "cpu" "m8")
 110        (eq_attr "type" "idiv"))
 111   "(m8_slot2 | m8_slot3), nothing*29")
 112
 113 ;; Both integer and floating-point load instructions have a latency of
 114 ;; only 3 cycles, and execute in the slot0.
 115 ;;
 116 ;; Misaligned load instructions feature a latency of 11 cycles.
 117 ;;
 118 ;; The prefetch instruction also executes in the load unit, but its
 119 ;; latency is only 1 cycle.
 120
 121 (define_insn_reservation "m8_load" 3
 122   (and (eq_attr "cpu" "m8")
 123        (ior (and (eq_attr "type" "load")
 124                  (eq_attr "subtype" "regular"))
 125             (eq_attr "type" "fpload,sload")))
 126   "m8_slot0, nothing*2") ;; slot0 is held for the issue cycle only
 127
 128 ;; (define_insn_reservation "m8_load_misalign" 11
 129 ;;  (and (eq_attr "cpu" "m8")
 130 ;;       (eq_attr "type" "load_mis,fpload_mis"))
 131 ;;  "m8_slot0, nothing*10")
 132
 133 (define_insn_reservation "m8_prefetch" 1
 134   (and (eq_attr "cpu" "m8")
 135        (and (eq_attr "type" "load")
 136             (eq_attr "subtype" "prefetch")))
 137   "m8_slot0") ;; same unit as regular loads
 138
 139 ;; Both integer and floating-point store instructions have a latency
 140 ;; of 1 cycle, and execute in the store data unit in slot1.  Only that
 141 ;; unit is reserved; the address-generation micro-op is not modelled.
 142 ;;
 143 ;; However, misaligned store instructions feature a latency of 3 cycles.
 144
 145 (define_insn_reservation "m8_store" 1
 146   (and (eq_attr "cpu" "m8")
 147        (eq_attr "type" "store,fpstore"))
 148   "m8_slot1")
 149
 150 ;; (define_insn_reservation "m8_store_misalign" 3
 151 ;;   (and (eq_attr "cpu" "m8")
 152 ;;        (eq_attr "type" "store_mis,fpstore_mis"))
 153 ;;   "m8_slot1, nothing*2")
 154
 155 ;; Control-transfer instructions execute in the Branch Unit in slot1
 156 ;; and are modelled with a latency of 1 cycle.
 157
 158 (define_insn_reservation "m8_cti" 1
 159   (and (eq_attr "cpu" "m8")
 160        (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
 161   "m8_slot1")
 162
 163 ;; Default case for the Floating-point and Graphics Unit (FGU), which
 164 ;; serves slot3: many of its instructions have a latency of 9 cycles.
 165
 166 (define_insn_reservation "m8_fp" 9
 167   (and (eq_attr "cpu" "m8")
 168        (ior (and (eq_attr "type" "fga")
 169                  (eq_attr "subtype" "fpu"))
 170             (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")))
 171   "m8_slot3, nothing*8")
 172
 173 ;; Floating-point division and floating-point square-root instructions
 174 ;; have high latencies (26 to 41 cycles).  They execute in the FGU.
 175
 176 (define_insn_reservation "m8_fpdivs" 26
 177   (and (eq_attr "cpu" "m8")
 178        (eq_attr "type" "fpdivs"))
 179   "m8_slot3, nothing*25")
 180
 181 (define_insn_reservation "m8_fpsqrts" 33
 182   (and (eq_attr "cpu" "m8")
 183        (eq_attr "type" "fpsqrts"))
 184   "m8_slot3, nothing*32")
 185
 186 (define_insn_reservation "m8_fpdivd" 30
 187   (and (eq_attr "cpu" "m8")
 188        (eq_attr "type" "fpdivd"))
 189   "m8_slot3, nothing*29")
 190
 191 (define_insn_reservation "m8_fpsqrtd" 41
 192   (and (eq_attr "cpu" "m8")
 193        (eq_attr "type" "fpsqrtd"))
 194   "m8_slot3, nothing*40")
 195
 196 ;; SIMD VIS instructions executing in the Floating-point and graphics
 197 ;; unit (FPG) in slot3 usually have a latency of 5 cycles.
 198 ;;
 199 ;; However, the latency for many instructions is only 3 cycles if the
 200 ;; consumer can also be executed in 3 cycles.  We model this with a
 201 ;; bypass.  In these cases the instructions are executed in one of the
 202 ;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
 203 ;; and 3.
 204
 205 (define_insn_reservation "m8_vis" 5
 206   (and (eq_attr "cpu" "m8")
 207        (ior (and (eq_attr "type" "visl")
 208                  (eq_attr "subtype" "single"))
 209             (and (eq_attr "type" "vismv")
 210                  (eq_attr "subtype" "single,movstouw"))
 211             (and (eq_attr "type" "fga")
 212                  (eq_attr "subtype" "maxmin,cmask,other"))
 213             (eq_attr "type" "viscmp,lzd")))
 214   "m8_slot3, nothing*4")
 215
 216 (define_bypass 3 "m8_vis" "m8_vis") ;; m8_vis result consumed by m8_vis
 217
 218 (define_insn_reservation "m8_gsr" 5
 219   (and (eq_attr "cpu" "m8")
 220        (and (eq_attr "type" "gsr")
 221             (eq_attr "subtype" "alignaddr")))
 222   "m8_slot3, nothing*4") ;; same timing as m8_vis, without its bypass
 223
 224 ;; A handful of VIS instructions complete with a latency of 1 cycle.
 225
 226 (define_insn_reservation "m8_vis_1cycle" 1
 227   (and (eq_attr "cpu" "m8")
 228        (ior (and (eq_attr "type" "fga")
 229                  (eq_attr "subtype" "addsub64"))
 230             (and (eq_attr "type" "visl")
 231                  (eq_attr "subtype" "double"))
 232             (and (eq_attr "type" "vismv")
 233                  (eq_attr "subtype" "double,movxtod,movdtox"))))
 234   "m8_slot3")
 235
 236 ;; Accessing the gsr register (read or write) takes more than 70 cycles.
 237
 238 (define_insn_reservation "m8_gsr_reg" 70
 239   (and (eq_attr "cpu" "m8")
 240        (and (eq_attr "type" "gsr")
 241             (eq_attr "subtype" "reg")))
 242   "m8_slot3, nothing*69")