add: performance values for Lattice MachXO2
[zpu.git] / zpu / hdl / zealot / zpu_medium.vhdl
blob47950fee9e0fc6f7b5deac2c1e554ae904efb1c8
1 ------------------------------------------------------------------------------
2 ----                                                                      ----
3 ----  ZPU Medium                                                          ----
4 ----                                                                      ----
5 ----  http://www.opencores.org/                                           ----
6 ----                                                                      ----
7 ----  Description:                                                        ----
8 ----  ZPU is a 32 bits small stack cpu. This is the medium size version.  ----
9 ----  Supports external memories.                                         ----
10 ----                                                                      ----
11 ----  To Do:                                                              ----
12 ----  -                                                                   ----
13 ----                                                                      ----
14 ----  Author:                                                             ----
15 ----    - Øyvind Harboe, oyvind.harboe zylin.com                          ----
16 ----    - Salvador E. Tropea, salvador inti.gob.ar                        ----
17 ----                                                                      ----
18 ------------------------------------------------------------------------------
19 ----                                                                      ----
20 ---- Copyright (c) 2008 Øyvind Harboe <oyvind.harboe zylin.com>           ----
21 ---- Copyright (c) 2008 Salvador E. Tropea <salvador inti.gob.ar>         ----
22 ---- Copyright (c) 2008 Instituto Nacional de Tecnología Industrial       ----
23 ----                                                                      ----
24 ---- Distributed under the BSD license                                    ----
25 ----                                                                      ----
26 ------------------------------------------------------------------------------
27 ----                                                                      ----
28 ---- Design unit:      ZPUMediumCore(Behave) (Entity and architecture)    ----
29 ---- File name:        zpu_medium.vhdl                                    ----
30 ---- Note:             None                                               ----
31 ---- Limitations:      None known                                         ----
32 ---- Errors:           None known                                         ----
33 ---- Library:          zpu                                                ----
34 ---- Dependencies:     IEEE.std_logic_1164                                ----
35 ----                   IEEE.numeric_std                                   ----
36 ----                   zpu.zpupkg                                         ----
37 ---- Target FPGA:      Spartan 3 (XC3S400-4-FT256)                        ----
38 ---- Language:         VHDL                                               ----
39 ---- Wishbone:         No                                                 ----
40 ---- Synthesis tools:  Xilinx Release 9.2.03i - xst J.39                  ----
41 ---- Simulation tools: GHDL [Sokcho edition] (0.2x)                       ----
42 ---- Text editor:      SETEdit 0.5.x                                      ----
43 ----                                                                      ----
44 ------------------------------------------------------------------------------
46 -- write_en_o   - set to '1' for a single cycle to send off a write request.
47 --                data_o is valid only while write_en_o='1'.
48 -- read_en_o    - set to '1' for a single cycle to send off a read request.
49 -- mem_busy_i   - It is illegal to send off a read/write request when
50 --                mem_busy_i='1'.
51 --                Set to '0' when data_i  is valid after a read request.
52 --                If it goes to '1'(busy), it is on the cycle after read/
53 --                write_en_o is '1'.
54 -- addr_o       - address for read/write request
55 -- data_i       - read data. Valid only on the cycle after mem_busy_i='0'
56 --                after read_en_o='1' for a single cycle.
57 -- data_o       - data to write
58 -- break_o      - set to '1' when CPU hits break instruction
60 library IEEE;
61 use IEEE.std_logic_1164.all;
62 use IEEE.numeric_std.all;
64 library zpu;
65 use zpu.zpupkg.all;
67 entity ZPUMediumCore is
68    generic(
69       WORD_SIZE    : integer:=32;  -- 16/32 (2**wordPower)
70       ADDR_W       : integer:=16;  -- Total address space width (incl. I/O)
71       MEM_W        : integer:=15;  -- Memory (prog+data+stack) width
72       D_CARE_VAL   : std_logic:='X'; -- Value used to fill the unsused bits
73       MULT_PIPE    : boolean:=false; -- Pipeline multiplication
74       BINOP_PIPE   : integer range 0 to 2:=0; -- Pipeline binary operations (-, =, < and <=)
75       ENA_LEVEL0   : boolean:=true;  -- eq, loadb, neqbranch and pushspadd
76       ENA_LEVEL1   : boolean:=true;  -- lessthan, ulessthan, mult, storeb, callpcrel and sub
77       ENA_LEVEL2   : boolean:=false; -- lessthanorequal, ulessthanorequal, call and poppcrel
78       ENA_LSHR     : boolean:=true;  -- lshiftright
79       ENA_IDLE     : boolean:=false; -- Enable the enable_i input
80       FAST_FETCH   : boolean:=true); -- Merge the st_fetch with the st_execute states
81    port(
82       clk_i        : in  std_logic; -- CPU Clock
83       reset_i      : in  std_logic; -- Sync Reset
84       enable_i     : in  std_logic; -- Hold the CPU (after reset)
85       break_o      : out std_logic; -- Break instruction executed
86       dbg_o        : out zpu_dbgo_t; -- Debug outputs (i.e. trace log)
87       -- Memory interface
88       mem_busy_i   : in  std_logic; -- Memory is busy
89       data_i       : in  unsigned(WORD_SIZE-1 downto 0); -- Data from mem
90       data_o       : out unsigned(WORD_SIZE-1 downto 0); -- Data to mem
91       addr_o       : out unsigned(ADDR_W-1 downto 0); -- Memory address
92       write_en_o   : out std_logic;  -- Memory write enable
93       read_en_o    : out std_logic); -- Memory read enable
94 end entity ZPUMediumCore;
96 architecture Behave of ZPUMediumCore is
97    constant BYTE_BITS    : integer:=WORD_SIZE/16; -- # of bits in a word that addresses bytes
98    constant WORD_BYTES   : integer:=WORD_SIZE/OPCODE_W;
99    constant MAX_ADDR_BIT : integer:=ADDR_W-2;
100    -- Stack Pointer initial value: BRAM size-8
101    constant SP_START_1   : unsigned(ADDR_W-1 downto 0):=to_unsigned((2**MEM_W)-8,ADDR_W);
102    constant SP_START     : unsigned(ADDR_W-1 downto BYTE_BITS):=
103                            SP_START_1(ADDR_W-1 downto BYTE_BITS);
105    -- Update [SP+1]. We hold it in b_r, this writes the value to memory.
106    procedure FlushB(signal we     : out std_logic;
107                     signal addr   : out unsigned(ADDR_W-1 downto BYTE_BITS);
108                     signal inc_sp : in  unsigned(ADDR_W-1 downto BYTE_BITS);
109                     signal data   : out unsigned(WORD_SIZE-1 downto 0);
110                     signal b      : in  unsigned(WORD_SIZE-1 downto 0)) is
111    begin
112       we   <= '1';
113       addr <= inc_sp;
114       data <= b;
115    end procedure FlushB;
117    -- Do a simple stack push, it is performed in the internal cache registers,
118    -- not in the real memory.
119    procedure Push(signal sp     : inout unsigned(ADDR_W-1 downto BYTE_BITS);
120                   signal a      : in    unsigned(WORD_SIZE-1 downto 0);
121                   signal b      : out   unsigned(WORD_SIZE-1 downto 0)) is
122    begin
123       b  <= a;      -- Update cache [SP+1]=[SP]
124       sp <= sp-1;
125    end procedure Push;
127    -- Do a simple stack pop, it is performed in the internal cache registers,
128    -- not in the real memory.
129    procedure Pop(signal sp     : inout unsigned(ADDR_W-1 downto BYTE_BITS);
130                  signal a      : out   unsigned(WORD_SIZE-1 downto 0);
131                  signal b      : in    unsigned(WORD_SIZE-1 downto 0)) is
132    begin
133       a  <= b;      -- Update cache [SP]=[SP+1]
134       sp <= sp+1;
135    end procedure Pop;
137    -- Expand a PC value to WORD_SIZE
138    function ExpandPC(v : unsigned(ADDR_W-1 downto 0)) return unsigned is
139       variable nv : unsigned(WORD_SIZE-1 downto 0);
140    begin
141       nv:=(others => '0');
142       nv(ADDR_W-1 downto 0):=v;
143       return nv;
144    end function ExpandPC;
146    -- Program counter
147    signal pc_r          : unsigned(ADDR_W-1 downto 0):=(others => '0');
148    -- Stack pointer
149    signal sp_r          : unsigned(ADDR_W-1 downto BYTE_BITS):=SP_START;
150    -- SP+1, SP+2 and SP-1 are very used, these are shortcuts
151    signal inc_sp        : unsigned(ADDR_W-1 downto BYTE_BITS);
152    signal inc_inc_sp    : unsigned(ADDR_W-1 downto BYTE_BITS);
153    -- a_r is a cache for the top of the stack [SP]
154    -- Note: as this is a stack CPU this is a very important register.
155    signal a_r           : unsigned(WORD_SIZE-1 downto 0);
156    -- b_r is a cache for the next value in the stack [SP+1]
157    signal b_r           : unsigned(WORD_SIZE-1 downto 0);
158    signal bin_op_res1_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
159    signal bin_op_res2_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
160    signal mult_res1_r   : unsigned(WORD_SIZE-1 downto 0);
161    signal mult_res2_r   : unsigned(WORD_SIZE-1 downto 0);
162    signal mult_res3_r   : unsigned(WORD_SIZE-1 downto 0);
163    signal mult_a_r      : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
164    signal mult_b_r      : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
165    signal idim_r        : std_logic;
166    signal write_en_r    : std_logic;
167    signal read_en_r     : std_logic;
168    signal addr_r        : unsigned(ADDR_W-1 downto BYTE_BITS):=(others => '0');
169    signal fetched_w_r   : unsigned(WORD_SIZE-1 downto 0);
171    type state_t is(st_load2, st_popped, st_load_sp2, st_load_sp3, st_add_sp2,
172                    st_fetch, st_execute, st_decode, st_decode2, st_resync,
173                    st_store_sp2, st_resync2, st_resync3, st_loadb2, st_storeb2,
174                    st_mult2, st_mult3, st_mult5, st_mult4, st_binary_op_res2,
175                    st_binary_op_res, st_idle); 
176    signal state : state_t:=st_resync;
178    -- Go to st_fetch state or just do its work
179    procedure DoFetch(constant FAST   : boolean;
180                      signal state    : out state_t;
181                      signal addr     : out unsigned(ADDR_W-1 downto BYTE_BITS);
182                      signal pc       : in  unsigned(ADDR_W-1 downto 0);
183                      signal re       : out std_logic;
184                      signal busy     : in  std_logic) is
185    begin
186       if FAST then
187          -- Equivalent to st_fetch
188          if busy='0' then
189             addr  <= pc(ADDR_W-1 downto BYTE_BITS);
190             re    <= '1';
191             state <= st_decode;
192          end if;
193       else
194          state <= st_fetch;
195       end if;
196    end procedure DoFetch;
198    -- Perform a "binary operation" (2 operands)
199    procedure DoBinOp(result         : in    unsigned(WORD_SIZE-1 downto 0);
200                      signal state   : out   state_t;
201                      signal sp      : inout unsigned(ADDR_W-1 downto BYTE_BITS);
202                      signal addr    : out   unsigned(ADDR_W-1 downto BYTE_BITS);
203                      signal re      : out   std_logic;
204                      signal dest    : out   unsigned(WORD_SIZE-1 downto 0);
205                      signal dest_p  : out   unsigned(WORD_SIZE-1 downto 0);
206                      constant DEPTH :       natural) is
207    begin
208       if DEPTH=2 then
209          -- 2 clocks: st_binary_op_res+st_binary_op_res2
210          state  <= st_binary_op_res;
211          dest_p <= result;
212       elsif DEPTH=1 then
213          -- 1 clock: st_binary_op_res2
214          state  <= st_binary_op_res2;
215          dest_p <= result;
216       else -- 0 clocks
217          re    <= '1';
218          addr  <= sp+2;
219          sp    <= sp+1;
220          dest  <= result;
221          state <= st_popped;
222       end if;
223    end procedure DoBinOp;
225    -- Perform a boolean "binary operation" (2 operands)
226    procedure DoBinOpBool(result         : in    boolean;
227                          signal state   : out   state_t;
228                          signal sp      : inout unsigned(ADDR_W-1 downto BYTE_BITS);
229                          signal addr    : out   unsigned(ADDR_W-1 downto BYTE_BITS);
230                          signal re      : out   std_logic;
231                          signal dest    : out   unsigned(WORD_SIZE-1 downto 0);
232                          signal dest_p  : out   unsigned(WORD_SIZE-1 downto 0);
233                          constant DEPTH :       natural) is
234       variable res : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
235    begin
236       if result then
237          res(0):='1';
238       end if;
239       DoBinOp(res,state,sp,addr,re,dest,dest_p,DEPTH);
240    end procedure DoBinOpBool;
242    type insn_t is (dec_add_top, dec_dup, dec_dup_stk_b, dec_pop, dec_add,
243                    dec_or, dec_and, dec_store, dec_add_sp, dec_shift, dec_nop,
244                    dec_im, dec_load_sp, dec_store_sp, dec_emulate, dec_load,
245                    dec_push_sp, dec_pop_pc, dec_pop_pc_rel, dec_not, dec_flip,
246                    dec_pop_sp, dec_neq_branch, dec_eq, dec_loadb, dec_mult,
247                    dec_less_than, dec_less_than_or_equal, dec_lshr,
248                    dec_u_less_than_or_equal, dec_u_less_than, dec_push_sp_add,
249                    dec_call, dec_call_pc_rel, dec_sub, dec_break, dec_storeb,
250                    dec_insn_fetch, dec_pop_down);
251    signal insn : insn_t;
252    type insn_array_t is array(0 to WORD_BYTES-1) of insn_t;
253    signal insns : insn_array_t;
254    type opcode_array_t is array(0 to WORD_BYTES-1) of unsigned(OPCODE_W-1 downto 0);
255    signal opcode_r : opcode_array_t;
256 begin
257    -- the memory subsystem will tell us one cycle later whether or
258    -- not it is busy
259    write_en_o <= write_en_r;
260    read_en_o  <= read_en_r;
261    addr_o(ADDR_W-1 downto BYTE_BITS) <= addr_r;
262    addr_o(BYTE_BITS-1 downto 0)      <= (others => '0');
264    -- SP+1 and +2
265    inc_sp     <= sp_r+1;
266    inc_inc_sp <= sp_r+2;
268    opcode_control:
269    process (clk_i)
270       variable topcode     : unsigned(OPCODE_W-1 downto 0);
271       variable ex_opcode   : unsigned(OPCODE_W-1 downto 0);
272       variable sp_offset   : unsigned(4 downto 0);
273       variable tsp_offset  : unsigned(4 downto 0);
274       variable next_pc     : unsigned(ADDR_W-1 downto 0);
275       variable tdecoded    : insn_t;
276       variable tinsns   : insn_array_t;
277       variable mult_res    : unsigned(WORD_SIZE*2-1 downto 0);
278       variable ipc_low     : integer range 0 to 3; -- Address inside a word (pc_r)
279       variable inpc_low    : integer range 0 to 3; -- Address inside a word (next_pc)
280       variable h_bit       : integer;
281       variable l_bit       : integer;
282       variable not_lshr    : std_logic:='1';
283    begin
284       if rising_edge(clk_i) then
285          break_o <= '0';
286          if reset_i='1' then
287             if ENA_IDLE then
288                state <= st_idle;
289             else
290                state <= st_resync;
291             end if;
292             sp_r         <= SP_START;
293             pc_r         <= (others => '0');
294             idim_r       <= '0';
295             write_en_r   <= '0';
296             read_en_r    <= '0';
297             mult_a_r     <= (others => '0');
298             mult_b_r     <= (others => '0');
299             dbg_o.b_inst <= '0';
300             -- Reseting add_r here makes XST fail to use BRAMs ?!
301          else -- reset_i='1'
302             if MULT_PIPE then
303                -- We must multiply unconditionally to get pipelined multiplication
304                mult_res:=mult_a_r*mult_b_r;
305                mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
306                mult_res2_r <= mult_res1_r;
307                mult_res3_r <= mult_res2_r;
308                mult_a_r    <= (others => D_CARE_VAL);
309                mult_b_r    <= (others => D_CARE_VAL);
310             end if;
312             if BINOP_PIPE=2 then
313                bin_op_res2_r <= bin_op_res1_r; -- pipeline a bit.
314             end if;
315    
316             read_en_r  <='0';
317             write_en_r <='0';
318             -- Allow synthesis tools to load bogus values when we don't
319             -- care about the address and output data.
320             addr_r     <= (others => D_CARE_VAL);
321             data_o     <= (others => D_CARE_VAL);
322    
323             if (write_en_r='1') and (read_en_r='1') then
324                report "read/write collision" severity failure;
325             end if;
327             ipc_low:=to_integer(pc_r(BYTE_BITS-1 downto 0));
328             sp_offset(4):=not opcode_r(ipc_low)(4);
329             sp_offset(3 downto 0):=opcode_r(ipc_low)(3 downto 0);
330             next_pc:=pc_r+1;
331    
332             -- Prepare trace snapshot
333             dbg_o.opcode <= opcode_r(ipc_low);
334             dbg_o.pc     <= resize(pc_r,32);
335             dbg_o.stk_a  <= resize(a_r,32);
336             dbg_o.stk_b  <= resize(b_r,32);
337             dbg_o.b_inst <= '0';
338             dbg_o.sp     <= (others => '0');
339             dbg_o.sp(ADDR_W-1 downto BYTE_BITS) <= sp_r;
340    
341             case state is
342                  when st_idle =>
343                       if enable_i='1' then
344                          state <= st_resync;
345                       end if;
346                  -- Initial state of ZPU, fetch top of stack (A/B) + first instruction
347                  when st_resync =>
348                       if mem_busy_i='0' then
349                          addr_r    <= sp_r;
350                          read_en_r <= '1';
351                          state     <= st_resync2;
352                       end if;
353                  when st_resync2 =>
354                       if mem_busy_i='0' then
355                          a_r       <= data_i;
356                          addr_r    <= inc_sp;
357                          read_en_r <= '1';
358                          state     <= st_resync3;
359                       end if;
360                  when st_resync3 =>
361                       if mem_busy_i='0' then
362                          b_r       <= data_i;
363                          addr_r    <= pc_r(ADDR_W-1 downto BYTE_BITS);
364                          read_en_r <= '1';
365                          state     <= st_decode;
366                       end if;
367                  when st_decode =>
368                       if mem_busy_i='0' then
369                          -- Here we latch the fetched word to give one full clock
370                          -- cycle to the instruction decoder. This could be removed
371                          -- if using BRAMs and the decoder delay isn't important.
372                          fetched_w_r <= data_i;
373                          state       <= st_decode2;
374                       end if;
375                  when st_decode2 =>
376                       -- decode 4 instructions in parallel
377                       for i in 0 to WORD_BYTES-1 loop
378                           topcode:=fetched_w_r((WORD_BYTES-1-i+1)*8-1 downto (WORD_BYTES-1-i)*8);
380                           tsp_offset(4):=not topcode(4);
381                           tsp_offset(3 downto 0):=topcode(3 downto 0);
383                           opcode_r(i) <= topcode;
384                           if topcode(7 downto 7)=OPCODE_IM then
385                              tdecoded:=dec_im;
386                           elsif topcode(7 downto 5)=OPCODE_STORESP then
387                              if tsp_offset=0 then
388                                 -- Special case, we can avoid a write
389                                 tdecoded:=dec_pop;
390                              elsif tsp_offset=1 then
391                                 -- Special case, collision
392                                 tdecoded:=dec_pop_down;
393                              else
394                                 tdecoded:=dec_store_sp;
395                              end if;
396                           elsif topcode(7 downto 5)=OPCODE_LOADSP then
397                              if tsp_offset=0 then
398                                 tdecoded:=dec_dup;
399                              elsif tsp_offset=1 then
400                                 tdecoded:=dec_dup_stk_b;
401                              else
402                                 tdecoded:=dec_load_sp;
403                              end if;
404                           elsif topcode(7 downto 5)=OPCODE_EMULATE then
405                              tdecoded:=dec_emulate;
406                              if ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_NEQBRANCH then
407                                 tdecoded:=dec_neq_branch;
408                              elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_EQ then
409                                 tdecoded:=dec_eq;
410                              elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_LOADB then
411                                 tdecoded:=dec_loadb;
412                              elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_PUSHSPADD then
413                                 tdecoded:=dec_push_sp_add;
414                              elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_LESSTHAN then
415                                 tdecoded:=dec_less_than;
416                              elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_ULESSTHAN then
417                                 tdecoded:=dec_u_less_than;
418                              elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_MULT then
419                                 tdecoded:=dec_mult;
420                              elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_STOREB then
421                                 tdecoded:=dec_storeb;
422                              elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_CALLPCREL then
423                                 tdecoded:=dec_call_pc_rel;
424                              elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_SUB then
425                                 tdecoded:=dec_sub;
426                              elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_LESSTHANOREQUAL then
427                                 tdecoded:=dec_less_than_or_equal;
428                              elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_ULESSTHANOREQUAL then
429                                 tdecoded:=dec_u_less_than_or_equal;
430                              elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_CALL then
431                                 tdecoded:=dec_call;
432                              elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_POPPCREL then
433                                 tdecoded:=dec_pop_pc_rel;
434                              elsif ENA_LSHR and topcode(5 downto 0)=OPCODE_LSHIFTRIGHT then
435                                 tdecoded:=dec_lshr;
436                              end if;
437                           elsif topcode(7 downto 4)=OPCODE_ADDSP then
438                              if tsp_offset=0 then
439                                 tdecoded:=dec_shift;
440                              elsif tsp_offset=1 then
441                                 tdecoded:=dec_add_top;
442                              else
443                                 tdecoded:=dec_add_sp;
444                              end if;
445                           else -- OPCODE_SHORT
446                              case topcode(3 downto 0) is
447                                   when OPCODE_BREAK =>
448                                        tdecoded:=dec_break;
449                                   when OPCODE_PUSHSP =>
450                                        tdecoded:=dec_push_sp;
451                                   when OPCODE_POPPC =>
452                                        tdecoded:=dec_pop_pc;
453                                   when OPCODE_ADD =>
454                                        tdecoded:=dec_add;
455                                   when OPCODE_OR =>
456                                        tdecoded:=dec_or;
457                                   when OPCODE_AND =>
458                                        tdecoded:=dec_and;
459                                   when OPCODE_LOAD =>
460                                        tdecoded:=dec_load;
461                                   when OPCODE_NOT =>
462                                        tdecoded:=dec_not;
463                                   when OPCODE_FLIP =>
464                                        tdecoded:=dec_flip;
465                                   when OPCODE_STORE =>
466                                        tdecoded:=dec_store;
467                                   when OPCODE_POPSP =>
468                                        tdecoded:=dec_pop_sp;
469                                   when others => -- OPCODE_NOP and others
470                                        tdecoded:=dec_nop;
471                              end case;
472                           end if;
473                           tinsns(i):=tdecoded;
474                       end loop;
475                       
476                       insn <= tinsns(ipc_low);
477                       -- once we wrap, we need to fetch
478                       tinsns(0):=dec_insn_fetch;
479                       insns <= tinsns;
480                       state <= st_execute;
482                       -- Each instruction must:
483                       --
484                       -- 1. increase pc_r if applicable
485                       -- 2. set next state if applicable
486                       -- 3. do it's operation
487                  when st_execute =>
488                       -- Some shortcut to make the code readable:
489                       inpc_low:=to_integer(next_pc(BYTE_BITS-1 downto 0));
490                       ex_opcode:=opcode_r(ipc_low);
491                       insn <= insns(inpc_low);
492                       -- Defaults used by most instructions
493                       if insn/=dec_insn_fetch and insn/=dec_im then
494                          dbg_o.b_inst <= '1';
495                          idim_r       <= '0';
496                       end if;
497                       case insn is
498                            when dec_insn_fetch =>
499                                 -- Not a real instruction, fetch new instructions
500                                 DoFetch(FAST_FETCH,state,addr_r,pc_r,read_en_r,mem_busy_i);
501                            when dec_im =>
502                                 -- Push(immediate value), IDIM=1
503                                 -- if IDIM=0 Push(signed(opcode & 0x7F)) else
504                                 --           Push((Pop()<<7)|(opcode&0x7F))
505                                 if mem_busy_i='0' then
506                                    dbg_o.b_inst <= '1';
507                                    idim_r       <= '1';
508                                    pc_r             <= pc_r+1;
509                                    if idim_r='1' then
510                                       -- We already started an IM sequence
511                                       -- Shift left 7 bits
512                                       a_r(WORD_SIZE-1 downto 7) <= a_r(WORD_SIZE-8 downto 0);
513                                       -- Put the new value
514                                       a_r(6 downto 0) <= ex_opcode(6 downto 0);
515                                    else
516                                       -- First IM, push the value sign extended
517                                       FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
518                                       a_r <= unsigned(resize(signed(ex_opcode(6 downto 0)),WORD_SIZE));
519                                       Push(sp_r,a_r,b_r);
520                                    end if;
521                                 end if;
522                            when dec_store_sp =>
523                                 -- [SP+Offset]=Pop()
524                                 if mem_busy_i='0' then
525                                    write_en_r <= '1';
526                                    addr_r     <= sp_r+sp_offset;
527                                    data_o     <= a_r;
528                                    Pop(sp_r,a_r,b_r);
529                                    -- We need to fetch B
530                                    state      <= st_store_sp2;
531                                 end if;
532                            when dec_load_sp =>
533                                 -- Push([SP+Offset])
534                                 if mem_busy_i='0' then
535                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
536                                    Push(sp_r,a_r,b_r);
537                                    -- We are flushing B cache, so we need more time to
538                                    -- read the value.
539                                    state <= st_load_sp2;
540                                 end if;
541                            when dec_emulate =>
542                                 -- Push(PC+1), PC=Opcode[4:0]*32
543                                 if mem_busy_i='0' then
544                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
545                                    state <= st_fetch;
546                                    a_r   <= ExpandPC(pc_r+1);
547                                    Push(sp_r,a_r,b_r);
548                                    -- The emulate address is:
549                                    --        98 7654 3210
550                                    -- 0000 00aa aaa0 0000
551                                    pc_r             <= (others => '0');
552                                    pc_r(9 downto 5) <= ex_opcode(4 downto 0);
553                                 end if;
554                            when dec_call_pc_rel =>
555                                 -- t=Pop(), Push(PC+1), PC=PC+t
556                                 if mem_busy_i='0' and ENA_LEVEL1 then
557                                    state <= st_fetch;
558                                    a_r   <= ExpandPC(pc_r+1);
559                                    pc_r  <= pc_r+a_r(ADDR_W-1 downto 0);
560                                 end if;
561                            when dec_call =>
562                                 -- t=Pop(), Push(PC+1), PC=t
563                                 if mem_busy_i='0' and ENA_LEVEL2 then
564                                    state <= st_fetch;
565                                    a_r   <= ExpandPC(pc_r+1);
566                                    pc_r  <= a_r(ADDR_W-1 downto 0);
567                                 end if;
568                            when dec_add_sp =>
569                                 -- Push(Pop()+[SP+Offset])
570                                 if mem_busy_i='0' then
571                                    -- Read SP+Offset
572                                    state     <= st_add_sp2;
573                                    read_en_r <= '1';
574                                    addr_r    <= sp_r+sp_offset;
575                                    pc_r      <= pc_r+1;
576                                 end if;
577                            when dec_push_sp =>
578                                 -- Push(SP)
579                                 if mem_busy_i='0' then
580                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
581                                    pc_r <= pc_r+1;
582                                    a_r  <= (others => '0');
583                                    a_r(ADDR_W-1 downto BYTE_BITS) <= sp_r;
584                                    Push(sp_r,a_r,b_r);
585                                 end if;
586                            when dec_pop_pc =>
587                                 -- PC=Pop() (return)
588                                 if mem_busy_i='0' then
589                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
590                                    state <= st_resync;
591                                    pc_r  <= a_r(ADDR_W-1 downto 0);
592                                    sp_r  <= inc_sp;
593                                 end if;
594                            when dec_pop_pc_rel =>
595                                 -- PC=PC+Pop()
596                                 if mem_busy_i='0' and ENA_LEVEL2 then
597                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
598                                    state <= st_resync;
599                                    pc_r  <= a_r(ADDR_W-1 downto 0)+pc_r;
600                                    sp_r  <= inc_sp;
601                                 end if;
602                            when dec_add =>
603                                 -- Push(Pop()+Pop())  [A=A+B, SP++, update B]
604                                 if mem_busy_i='0' then
605                                    state     <= st_popped;
606                                    a_r       <= a_r+b_r;
607                                    read_en_r <= '1';
608                                    addr_r    <= inc_inc_sp;
609                                    sp_r      <= inc_sp;
610                                 end if;
611                            when dec_sub =>
612                                 -- a=Pop(), b=Pop(), Push(b-a)
613                                 if mem_busy_i='0' and ENA_LEVEL1 then
614                                    DoBinOp(b_r-a_r,state,sp_r,addr_r,read_en_r,
615                                            a_r,bin_op_res1_r,BINOP_PIPE);
616                                 end if;
617                            when dec_pop =>
618                                 -- Pop()
619                                 if mem_busy_i='0' then
620                                    state     <= st_popped;
621                                    addr_r    <= inc_inc_sp;
622                                    read_en_r <= '1';
623                                    Pop(sp_r,a_r,b_r);
624                                 end if;
625                            when dec_pop_down =>
626                                 -- t=Pop(), Pop(), Push(t)
627                                 if mem_busy_i='0' then
628                                    -- PopDown leaves top of stack unchanged
629                                    state     <= st_popped;
630                                    addr_r    <= inc_inc_sp;
631                                    read_en_r <= '1';
632                                    sp_r      <= inc_sp;
633                                 end if;
634                            when dec_or =>
635                                 -- Push(Pop() or Pop())
636                                 if mem_busy_i='0' then
637                                    state     <= st_popped;
638                                    a_r       <= a_r or b_r;
639                                    read_en_r <= '1';
640                                    addr_r    <= inc_inc_sp;
641                                    sp_r      <= inc_sp;
642                                 end if;
643                            when dec_and =>
644                                 -- Push(Pop() and Pop())
645                                 if mem_busy_i='0' then
646                                    state     <= st_popped;
647                                    a_r       <= a_r and b_r;
648                                    read_en_r <= '1';
649                                    addr_r    <= inc_inc_sp;
650                                    sp_r      <= inc_sp;
651                                 end if;
652                            when dec_eq =>
653                                 -- a=Pop(), b=Pop(), Push(a=b ? 1 : 0)
654                                 if mem_busy_i='0' and ENA_LEVEL0 then
655                                    DoBinOpBool(a_r=b_r,state,sp_r,addr_r,read_en_r,
656                                                a_r,bin_op_res1_r,BINOP_PIPE);
657                                 end if;
658                            when dec_u_less_than =>
659                                 -- a=Pop(), b=Pop(), Push(a<b ? 1 : 0)
660                                 if mem_busy_i='0' and ENA_LEVEL1 then
661                                    DoBinOpBool(a_r<b_r,state,sp_r,addr_r,read_en_r,
662                                                a_r,bin_op_res1_r,BINOP_PIPE);
663                                 end if;
664                            when dec_u_less_than_or_equal =>
665                                 -- a=Pop(), b=Pop(), Push(a<=b ? 1 : 0)
666                                 if mem_busy_i='0' and ENA_LEVEL2 then
667                                    DoBinOpBool(a_r<=b_r,state,sp_r,addr_r,read_en_r,
668                                                a_r,bin_op_res1_r,BINOP_PIPE);
669                                 end if;
670                            when dec_less_than =>
671                                 -- a=signed(Pop()), b=signed(Pop()), Push(a<b ? 1 : 0)
672                                 if mem_busy_i='0' and ENA_LEVEL1 then
673                                    DoBinOpBool(signed(a_r)<signed(b_r),state,sp_r,
674                                                addr_r,read_en_r,a_r,bin_op_res1_r,
675                                                BINOP_PIPE);
676                                 end if;
677                            when dec_less_than_or_equal =>
678                                 -- a=signed(Pop()), b=signed(Pop()), Push(a<=b ? 1 : 0)
679                                 if mem_busy_i='0' and ENA_LEVEL2 then
680                                    DoBinOpBool(signed(a_r)<=signed(b_r),state,sp_r,
681                                                addr_r,read_en_r,a_r,bin_op_res1_r,
682                                                BINOP_PIPE);
683                                 end if;
684                            when dec_load =>
685                                 -- Push([Pop()])
686                                 if mem_busy_i='0' then
687                                    state     <= st_load2;
688                                    addr_r    <= a_r(ADDR_W-1 downto BYTE_BITS);
689                                    read_en_r <= '1';
690                                    pc_r      <= pc_r+1;
691                                 end if;
692                            when dec_dup =>
693                                 -- t=Pop(), Push(t), Push(t)
694                                 if mem_busy_i='0' then
695                                    pc_r <= pc_r+1;
696                                    -- A is dupped, no change
697                                    Push(sp_r,a_r,b_r);
698                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
699                                 end if;
700                            when dec_dup_stk_b =>
701                                 -- Pop(), t=Pop(), Push(t), Push(t), Push(t)
702                                 if mem_busy_i='0' then
703                                    pc_r <= pc_r+1;
704                                    a_r  <= b_r;
705                                    -- B goes to A
706                                    Push(sp_r,a_r,b_r);
707                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
708                                 end if;
709                            when dec_store =>
710                                 -- a=Pop(), b=Pop(), [a]=b
711                                 if mem_busy_i='0' then
712                                    state      <= st_resync;
713                                    pc_r       <= pc_r+1;
714                                    addr_r     <= a_r(ADDR_W-1 downto BYTE_BITS);
715                                    data_o     <= b_r;
716                                    write_en_r <= '1';
717                                    sp_r       <= inc_inc_sp;
718                                 end if;
719                            when dec_pop_sp =>
720                                 -- SP=Pop()
721                                 if mem_busy_i='0' then
722                                    FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
723                                    state <= st_resync;
724                                    pc_r  <= pc_r+1;
725                                    sp_r  <= a_r(ADDR_W-1 downto BYTE_BITS);
726                                 end if;
727                            when dec_nop =>
728                                 pc_r <= pc_r+1;
729                            when dec_not =>
730                                 -- Push(not(Pop()))
731                                 pc_r <= pc_r+1;
732                                 a_r  <= not a_r;
733                            when dec_flip =>
734                                 -- Push(flip(Pop()))
735                                 pc_r <= pc_r+1;
736                                 for i in 0 to WORD_SIZE-1 loop
737                                     a_r(i) <= a_r(WORD_SIZE-1-i);
738                                 end loop;
739                            when dec_add_top =>
740                                 -- a=Pop(), b=Pop(), Push(b), Push(a+b)
741                                 pc_r <= pc_r+1;
742                                 a_r  <= a_r+b_r;
743                            when dec_shift =>
744                                 -- Push(Pop()<<1) [equivalent to a=Pop(), Push(a+a)]
745                                 pc_r <= pc_r+1;
746                                 a_r(WORD_SIZE-1 downto 1) <= a_r(WORD_SIZE-2 downto 0);
747                                 a_r(0) <= '0';
748                            when dec_push_sp_add =>
749                                 -- Push(Pop()+SP)
750                                 if ENA_LEVEL0 then
751                                    pc_r <= pc_r+1;
752                                    a_r  <= (others => '0');
753                                    a_r(ADDR_W-1 downto BYTE_BITS) <=
754                                       a_r(ADDR_W-1-BYTE_BITS downto 0)+sp_r;
755                                 end if;
756                            when dec_neq_branch =>
757                                 -- a=Pop(), b=Pop(), PC+=b==0 ? 1 : a
758                                 -- Branches are almost always taken as they form loops
759                                 if ENA_LEVEL0 then
760                                    sp_r  <= inc_inc_sp;
761                                    -- Need to fetch stack again.
762                                    state <= st_resync;
763                                    if b_r/=0 then
764                                       pc_r <= a_r(ADDR_W-1 downto 0)+pc_r;
765                                    else
766                                       pc_r <= pc_r+1;
767                                    end if;
768                                 end if;
769                            when dec_mult =>
770                                 -- Push(Pop()*Pop())
771                                 if ENA_LEVEL1 then
772                                    if MULT_PIPE then
773                                       mult_a_r <= a_r;
774                                       mult_b_r <= b_r;
775                                       state    <= st_mult2;
776                                    else
777                                       mult_res:=a_r*b_r;
778                                       mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
779                                       state       <= st_mult5;
780                                    end if;
781                                 end if;
782                            when dec_break =>
783                                 -- Assert the break_o signal
784                                 --report "Break instruction encountered" severity failure;
785                                 break_o <= '1';
786                                 pc_r    <= pc_r+1;
787                            when dec_loadb =>
788                                 -- Push([Pop()] & 0xFF) (byte address)
789                                 if mem_busy_i='0' and ENA_LEVEL0 then
790                                    state     <= st_loadb2;
791                                    addr_r    <= a_r(ADDR_W-1 downto BYTE_BITS);
792                                    read_en_r <= '1';
793                                    pc_r      <= pc_r+1;
794                                 end if;
795                            when dec_storeb =>
796                                 -- [Pop()]=Pop() & 0xFF (byte address)
797                                 if mem_busy_i='0' and ENA_LEVEL1 then
798                                    state     <= st_storeb2;
799                                    addr_r    <= a_r(ADDR_W-1 downto BYTE_BITS);
800                                    read_en_r <= '1';
801                                    pc_r      <= pc_r+1;
802                                 end if;
803                            when dec_lshr =>
804                                 -- a=Pop(), b=Pop(), Push(b>>(a&0x3F))
805                                 if ENA_LSHR then
806                                    -- This instruction takes more than one cycle.
807                                    -- We must avoid duplications in the trace log.
808                                    dbg_o.b_inst <= not_lshr;
809                                    not_lshr:='0';
810                                    if a_r(5 downto 0)=0 then -- Only 6 bits used
811                                       -- No more shifts
812                                       if mem_busy_i='0' then
813                                          state     <= st_popped;
814                                          a_r       <= b_r;
815                                          read_en_r <= '1';
816                                          addr_r    <= inc_inc_sp;
817                                          sp_r      <= inc_sp;
818                                          not_lshr:='1';
819                                       end if;
820                                    else -- More shifts needed
821                                       b_r <= "0"&b_r(WORD_SIZE-1 downto 1);
822                                       a_r(5 downto 0) <= a_r(5 downto 0)-1;
823                                       insn <= insn;
824                                    end if;
825                                 end if;
826                            when others =>
827                                 -- Undefined behavior, we shouldn't get here.
828                                 -- It only helps synthesis tools.
829                                 sp_r <= (others => D_CARE_VAL);
830                                 report "Illegal decode instruction?!" severity failure;
831                                 --break_o <= '1';
832                       end case;
833                  -- The followup of operations that takes more than one execution clock
834                  when st_store_sp2 =>
835                       if mem_busy_i='0' then
836                          addr_r    <= inc_sp;
837                          read_en_r <= '1';
838                          state     <= st_popped;
839                       end if;
840                  when st_load_sp2 =>
841                       if mem_busy_i='0' then
842                          state     <= st_load_sp3;
843                          -- Now we can read SP+Offset (SP already decremented)
844                          read_en_r <= '1';
845                          addr_r    <= sp_r+sp_offset+1;
846                       end if;
847                  when st_load_sp3 =>
848                       if mem_busy_i='0' then
849                          -- Note: We can't increment PC in the decode stage
850                          -- because it will modify sp_offset.
851                          pc_r  <= pc_r+1;
852                          -- Finally we have the result in A
853                          state <= st_execute;
854                          a_r   <= data_i;
855                       end if;
856                  when st_add_sp2 =>
857                       if mem_busy_i='0' then
858                          state <= st_execute;
859                          a_r   <= a_r+data_i;
860                       end if;
861                  when st_load2 =>
862                       if mem_busy_i='0' then
863                          a_r   <= data_i;
864                          state <= st_execute;
865                       end if;
866                  when st_loadb2 =>
867                       if mem_busy_i='0' then
868                          a_r <= (others => '0');
869                          -- Select the source bits using the less significant bits (byte address)
870                          h_bit:=(WORD_BYTES-to_integer(a_r(BYTE_BITS-1 downto 0)))*8-1;
871                          l_bit:=h_bit-7;
872                          a_r(7 downto 0) <= data_i(h_bit downto l_bit);
873                          state <= st_execute;
874                       end if;
875                  when st_storeb2 =>
876                       if mem_busy_i='0' then
877                          addr_r <= a_r(ADDR_W-1 downto BYTE_BITS);
878                          data_o <= data_i;
879                          -- Select the source bits using the less significant bits (byte address)
880                          h_bit:=(WORD_BYTES-to_integer(a_r(BYTE_BITS-1 downto 0)))*8-1;
881                          l_bit:=h_bit-7;
882                          data_o(h_bit downto l_bit) <= b_r(7 downto 0);
883                          write_en_r <= '1';
884                          sp_r       <= inc_inc_sp;
885                          state      <= st_resync;
886                       end if;
887                  when st_fetch =>
888                       if mem_busy_i='0' then
889                          addr_r    <= pc_r(ADDR_W-1 downto BYTE_BITS);
890                          read_en_r <= '1';
891                          state     <= st_decode;
892                       end if;
893                  -- The following states can be used to leave cycles free for
894                  -- tools that can automagically decompose the multiplication
895                  -- in various stages. Xilinx tools can do it to increase the
896                  -- multipliers performance.
897                  when st_mult2 =>
898                       state <= st_mult3;
899                  when st_mult3 =>
900                       state <= st_mult4;
901                  when st_mult4 =>
902                       state <= st_mult5;
903                  when st_mult5 =>
904                       if mem_busy_i='0' then
905                          if MULT_PIPE then
906                             a_r <= mult_res3_r;
907                          else
908                             a_r <= mult_res1_r;
909                          end if;
910                          read_en_r <= '1';
911                          addr_r    <= inc_inc_sp;
912                          sp_r      <= inc_sp;
913                          state     <= st_popped;
914                       end if;
915                when st_binary_op_res =>
916                     -- BINOP_PIPE=2
917                     state <= st_binary_op_res2;
918                when st_binary_op_res2 =>
919                     -- BINOP_PIPE>=1
920                     read_en_r <= '1';
921                     addr_r    <= inc_inc_sp;
922                     sp_r      <= inc_sp;
923                     state     <= st_popped;
924                     if BINOP_PIPE=2 then
925                        a_r <= bin_op_res2_r;
926                     else -- 1
927                        a_r <= bin_op_res1_r;
928                     end if;
929                when st_popped =>
930                     if mem_busy_i='0' then
931                        -- Note: Moving this PC++ to the decoder seems to
932                        -- consume more LUTs.
933                        pc_r  <= pc_r+1;
934                        b_r   <= data_i;
935                        state <= st_execute;
936                     end if;
937                when others =>
938                     -- Undefined behavior, we shouldn't get here.
939                     -- It only helps synthesis tools.
940                     sp_r <= (others => D_CARE_VAL);
941                     report "Illegal state?!" severity failure;
942                     --break_o <= '1';
943             end case; -- state
944          end if; -- else reset_i='1'
945       end if; -- rising_edge(clk_i)
946    end process opcode_control;
947 end architecture Behave; -- Entity: ZPUMediumCore