#=======================================================================
#
#   Python Lexical Analyser
#
#   Lexical Analyser Specification
#
#=======================================================================

# debug_flags for Lexicon constructor
class State(object):
    """
    This class is used as part of a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)
    """

    # name   -- character string naming the state (read by Lexicon.__init__)
    # tokens -- list of (pattern, action) token definitions for this state
    name = None
    tokens = None

    def __init__(self, name, tokens):
        # Plain value container: Lexicon.__init__ reads .name and .tokens
        # when it encounters a State instance in the specification list.
        self.name = name
        self.tokens = tokens
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

       2) One of the following special actions:

          IGNORE means that the recognised characters will be treated as
                 white space and ignored. Scanning will continue until
                 the next non-ignored token is recognised before returning.

          TEXT   causes the scanned text itself to be returned as the
                 value of the token.

       3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # Machine -- the DFA built by __init__
    tables = None   # StateTableMachine
113 def __init__(self
, specifications
, debug
= None, debug_flags
= 7, timings
= None):
114 if type(specifications
) != types
.ListType
:
115 raise Errors
.InvalidScanner("Scanner definition is not a list")
117 from Timing
import time
120 nfa
= Machines
.Machine()
121 default_initial_state
= nfa
.new_initial_state('')
123 for spec
in specifications
:
124 if isinstance(spec
, State
):
125 user_initial_state
= nfa
.new_initial_state(spec
.name
)
126 for token
in spec
.tokens
:
127 self
.add_token_to_machine(
128 nfa
, user_initial_state
, token
, token_number
)
129 token_number
= token_number
+ 1
130 elif type(spec
) == types
.TupleType
:
131 self
.add_token_to_machine(
132 nfa
, default_initial_state
, spec
, token_number
)
133 token_number
= token_number
+ 1
135 raise Errors
.InvalidToken(
137 "Expected a token definition (tuple) or State instance")
140 total_time
= total_time
+ (time2
- time1
)
142 if debug
and (debug_flags
& 1):
143 debug
.write("\n============= NFA ===========\n")
145 dfa
= DFA
.nfa_to_dfa(nfa
, debug
= (debug_flags
& 3) == 3 and debug
)
148 total_time
= total_time
+ (time4
- time3
)
149 if debug
and (debug_flags
& 2):
150 debug
.write("\n============= DFA ===========\n")
153 timings
.write("Constructing NFA : %5.2f\n" % (time2
- time1
))
154 timings
.write("Converting to DFA: %5.2f\n" % (time4
- time3
))
155 timings
.write("TOTAL : %5.2f\n" % total_time
)
158 def add_token_to_machine(self
, machine
, initial_state
, token_spec
, token_number
):
160 (re
, action_spec
) = self
.parse_token_definition(token_spec
)
161 # Disabled this -- matching empty strings can be useful
163 # raise Errors.InvalidToken(
164 # token_number, "Pattern can match 0 input symbols")
165 if isinstance(action_spec
, Actions
.Action
):
170 except AttributeError:
171 action
= Actions
.Return(action_spec
)
173 action
= Actions
.Call(action_spec
)
174 final_state
= machine
.new_state()
175 re
.build_machine(machine
, initial_state
, final_state
,
176 match_bol
= 1, nocase
= 0)
177 final_state
.set_action(action
, priority
= -token_number
)
178 except Errors
.PlexError
, e
:
179 raise e
.__class
__("Token number %d: %s" % (token_number
, e
))
181 def parse_token_definition(self
, token_spec
):
182 if type(token_spec
) != types
.TupleType
:
183 raise Errors
.InvalidToken("Token definition is not a tuple")
184 if len(token_spec
) != 2:
185 raise Errors
.InvalidToken("Wrong number of items in token definition")
186 pattern
, action
= token_spec
187 if not isinstance(pattern
, Regexps
.RE
):
188 raise Errors
.InvalidToken("Pattern is not an RE instance")
189 return (pattern
, action
)
191 def get_initial_state(self
, name
):
192 return self
.machine
.get_initial_state(name
)