# -----------------------------------------------------------------------------
# Copyright (C) 2001-2015,
# David M. Beazley (Dabeaz LLC)
# SPDX-License-Identifier: BSD-3-Clause
# -----------------------------------------------------------------------------
__version__    = '3.8'
__tabversion__ = '3.8'

import re
import sys
import types
import copy
import os
import inspect
# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)
# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')
# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self, message, s):
        self.args = (message,)
        self.text = s
# Token class. This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return 'LexToken(%s,%r,%d,%d)' % (self.type, self.value, self.lineno, self.lexpos)

    def __repr__(self):
        return str(self)
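# A minimal sketch (illustrative only, not part of the library) of the
# attributes every token carries; the names follow the class above and the
# values are hypothetical:
#
#     tok = LexToken()
#     tok.type   = 'NUMBER'   # token type; must appear in the module's tokens list
#     tok.value  = '42'       # matched text (or whatever a rule function assigns)
#     tok.lineno = 1          # line number where the token was found
#     tok.lexpos = 10         # index of the token relative to the start of the input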
# This object is a stand-in for a logging object created by the
# logging module.
class PlyLogger(object):
    def __init__(self, f):
        self.f = f

    def critical(self, msg, *args, **kwargs):
        self.f.write((msg % args) + '\n')

    def warning(self, msg, *args, **kwargs):
        self.f.write('WARNING: ' + (msg % args) + '\n')

    def error(self, msg, *args, **kwargs):
        self.f.write('ERROR: ' + (msg % args) + '\n')

    info = critical
    debug = critical
# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self, name):
        return self

    def __call__(self, *args, **kwargs):
        return self
# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
#    input()  -  Store a new string in the lexer
#    token()  -  Get the next token
#    clone()  -  Clone the lexer
#
#    lineno   -  Current line number
#    lexpos   -  Current position in the input string
# -----------------------------------------------------------------------------
class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re, findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = 'INITIAL'     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexstateeoff = {}        # Dictionary of eof functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lexeoff = None           # EOF rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ''           # Ignored characters
        self.lexliterals = ''         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = False      # Optimized mode
    def clone(self, object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.
        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            c.lexmodule = object
        return c
    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, lextab, outputdir=''):
        if isinstance(lextab, types.ModuleType):
            raise IOError("Won't overwrite existing lextab module")
        basetabmodule = lextab.split('.')[-1]
        filename = os.path.join(outputdir, basetabmodule) + '.py'
        with open(filename, 'w') as tf:
            tf.write('# %s.py. This file automatically created by PLY (version %s). Don\'t edit!\n' % (basetabmodule, __version__))
            tf.write('_tabversion = %s\n' % repr(__tabversion__))
            tf.write('_lextokens = %s\n' % repr(self.lextokens))
            tf.write('_lexreflags = %s\n' % repr(self.lexreflags))
            tf.write('_lexliterals = %s\n' % repr(self.lexliterals))
            tf.write('_lexstateinfo = %s\n' % repr(self.lexstateinfo))

            # Rewrite the lexstatere table, replacing function objects with function names
            tabre = {}
            for statename, lre in self.lexstatere.items():
                titem = []
                for (pat, func), retext, renames in zip(lre, self.lexstateretext[statename], self.lexstaterenames[statename]):
                    titem.append((retext, _funcs_to_names(func, renames)))
                tabre[statename] = titem

            tf.write('_lexstatere = %s\n' % repr(tabre))
            tf.write('_lexstateignore = %s\n' % repr(self.lexstateignore))

            taberr = {}
            for statename, ef in self.lexstateerrorf.items():
                taberr[statename] = ef.__name__ if ef else None
            tf.write('_lexstateerrorf = %s\n' % repr(taberr))

            tabeof = {}
            for statename, ef in self.lexstateeoff.items():
                tabeof[statename] = ef.__name__ if ef else None
            tf.write('_lexstateeoff = %s\n' % repr(tabeof))
    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            exec('import %s' % tabfile)
            lextab = sys.modules[tabfile]

        if getattr(lextab, '_tabversion', '0.0') != __tabversion__:
            raise ImportError('Inconsistent PLY version')

        self.lextokens      = lextab._lextokens
        self.lexreflags     = lextab._lexreflags
        self.lexliterals    = lextab._lexliterals
        self.lextokens_all  = self.lextokens | set(self.lexliterals)
        self.lexstateinfo   = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere     = {}
        self.lexstateretext = {}
        for statename, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for pat, func_name in lre:
                titem.append((re.compile(pat, lextab._lexreflags | re.VERBOSE), _names_to_funcs(func_name, fdict)))
            self.lexstatere[statename] = titem
            self.lexstateretext[statename] = txtitem

        self.lexstateerrorf = {}
        for statename, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[statename] = fdict[ef]

        self.lexstateeoff = {}
        for statename, ef in lextab._lexstateeoff.items():
            self.lexstateeoff[statename] = fdict[ef]

        self.begin('INITIAL')
    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError('Expected a string')
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)
    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        if state not in self.lexstatere:
            raise ValueError('Undefined state')
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, '')
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexeoff = self.lexstateeoff.get(state, None)
        self.lexstate = state
    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)
    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())
    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate
    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n
    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing.
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it
                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos    = self.lexpos     # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens_all:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.__code__.co_filename, func.__code__.co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = 'error'
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])

        if self.lexeoff:
            tok = LexToken()
            tok.type = 'eof'
            tok.value = ''
            tok.lineno = self.lineno
            tok.lexpos = lexpos
            tok.lexer = self
            self.lexpos = lexpos
            newtok = self.lexeoff(tok)
            return newtok

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError('No input string given with input()')
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
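# A minimal usage sketch of the Lexer runtime above (illustrative only; it
# assumes a lexer object already built by the lex() function defined below):
#
#     lexer.input('3 + 4')        # store a new input string
#     while True:
#         tok = lexer.token()     # fetch tokens until None is returned
#         if tok is None:
#             break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)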
# -----------------------------------------------------------------------------
# === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# _get_regex(func)
#
# Returns the regular expression assigned to a function either as a doc string
# or as a .regex attribute attached by the @TOKEN decorator.
# -----------------------------------------------------------------------------
def _get_regex(func):
    return getattr(func, 'regex', func.__doc__)
# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack. This is used to get the environment
# associated with the yacc() call if none was provided.
# -----------------------------------------------------------------------------
def get_caller_module_dict(levels):
    f = sys._getframe(levels)
    ldict = f.f_globals.copy()
    if f.f_globals != f.f_locals:
        ldict.update(f.f_locals)
    return ldict
# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------
def _funcs_to_names(funclist, namelist):
    result = []
    for f, name in zip(funclist, namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result
# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------
def _names_to_funcs(namelist, fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]], n[1]))
        else:
            result.append(n)
    return result
# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------
def _form_master_re(relist, reflags, ldict, toknames):
    if not relist:
        return []
    regex = '|'.join(relist)
    try:
        lexre = re.compile(regex, re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find('ignore_') > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        m = int(len(relist)/2)
        if m == 0:
            m = 1
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return (llist+rlist), (lre+rre), (lnames+rnames)
# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------
def _statetoken(s, names):
    parts = s.split('_')
    for i, part in enumerate(parts[1:], 1):
        if part not in names and part != 'ANY':
            break

    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = '_'.join(parts[i:])
    return (states, tokenname)
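# A worked example (for illustration, not executed): with
# names = {'INITIAL': 'inclusive', 'foo': 'exclusive', 'bar': 'exclusive'},
# _statetoken('t_foo_bar_SPAM', names) returns (('foo', 'bar'), 'SPAM'),
# while _statetoken('t_NUMBER', names) returns (('INITIAL',), 'NUMBER').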
# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self, ldict, log=None, reflags=0):
        self.ldict      = ldict
        self.error_func = None
        self.tokens     = []
        self.reflags    = reflags
        self.stateinfo  = {'INITIAL': 'inclusive'}
        self.modules    = set()
        self.error      = False
        self.log        = PlyLogger(sys.stderr) if log is None else log
    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error
    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get('tokens', None)
        if not tokens:
            self.log.error('No token list is defined')
            self.error = True
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error('tokens must be a list or tuple')
            self.error = True
            return

        if not tokens:
            self.log.error('tokens is empty')
            self.error = True
            return

        self.tokens = tokens
    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = True
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1
    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get('literals', '')
        if not self.literals:
            self.literals = ''
    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error('Invalid literal %s. Must be a single character', repr(c))
                    self.error = True
        except TypeError:
            self.log.error('Invalid literals specification. literals must be a sequence of characters')
            self.error = True
    def get_states(self):
        self.states = self.ldict.get('states', None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error('states must be defined as a tuple or list')
                self.error = True
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = True
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error('State name %s must be a string', repr(name))
                        self.error = True
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = True
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = True
                        continue
                    self.stateinfo[name] = statetype
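    # For illustration only (not executed): a typical states declaration in a
    # user module, as accepted by get_states() above, looks like
    #
    #     states = (
    #         ('comment', 'exclusive'),
    #         ('python',  'inclusive'),
    #     )
    #
    # where each entry names a state and marks it inclusive or exclusive;
    # the state names here are hypothetical.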
    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)
    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings
        self.toknames = {}        # Mapping of symbols to token names
        self.funcsym  = {}        # Symbols defined as functions
        self.strsym   = {}        # Symbols defined as strings
        self.ignore   = {}        # Ignore strings by state
        self.errorf   = {}        # Error functions by state
        self.eoff     = {}        # EOF functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error('No rules of the form t_rulename are defined')
            self.error = True
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, '__call__'):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'eof':
                    for s in states:
                        self.eoff[s] = t
                elif tokname == 'ignore':
                    line = t.__code__.co_firstlineno
                    file = t.__code__.co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = True
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if '\\' in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)
                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = True
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error('%s not defined as a function or string', f)
                self.error = True

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: x[1].__code__.co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            s.sort(key=lambda x: len(x[1]), reverse=True)
    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions
            for fname, f in self.funcsym[state]:
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True
                    continue

                if not _get_regex(f):
                    self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), re.VERBOSE | self.reflags)
                    if c.match(''):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
                        self.error = True
                except re.error as e:
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
                    if '#' in _get_regex(f):
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
                    self.error = True

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = True
                    continue

                if tokname not in self.tokens and tokname.find('ignore_') < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (name, r), re.VERBOSE | self.reflags)
                    if c.match(''):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = True
                except re.error as e:
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = True

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = True

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True

        for module in self.modules:
            self.validate_module(module)
    # -----------------------------------------------------------------------------
    # validate_module()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file. This is done using a simple regular expression
    # match on each line in the source code of the given module.
    # -----------------------------------------------------------------------------
    def validate_module(self, module):
        try:
            lines, linen = inspect.getsourcelines(module)
        except IOError:
            return

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}
        linen += 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    filename = inspect.getsourcefile(module)
                    self.log.error('%s:%d: Rule %s redefined. Previously defined on line %d', filename, linen, name, prev)
                    self.error = True
            linen += 1
# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab',
        reflags=0, nowarn=False, outputdir=None, debuglog=None, errorlog=None):

    global lexer

    ldict = None
    stateinfo = {'INITIAL': 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token, input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object:
        module = object

    # Get the module dictionary used for the parser
    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
        # If no __file__ attribute is available, try to obtain it from the __module__ instead
        if '__file__' not in ldict:
            ldict['__file__'] = sys.modules[ldict['__module__']].__file__
    else:
        ldict = get_caller_module_dict(2)

    # Determine if the module is package of a package or not.
    # If so, fix the tabmodule setting so that tables load correctly
    pkg = ldict.get('__package__')
    if pkg and isinstance(lextab, str):
        if '.' not in lextab:
            lextab = pkg + '.' + lextab

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj
        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info('lex: tokens = %r', linfo.tokens)
        debuglog.info('lex: literals = %r', linfo.literals)
        debuglog.info('lex: states = %r', linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = set()
    for n in linfo.tokens:
        lexobj.lextokens.add(n)

    # Get literals specification
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = {}
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = f.__code__.co_firstlineno
            file = f.__code__.co_filename
            regex_list.append('(?P<%s>%s)' % (fname, _get_regex(f)))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append('(?P<%s>%s)' % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions
    if debug:
        debuglog.info('lex: ==== MASTER REGEXS FOLLOW ====')

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != 'INITIAL' and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere['INITIAL']
    lexobj.lexretext = lexobj.lexstateretext['INITIAL']
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get('INITIAL', '')

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get('INITIAL', None)
    if not lexobj.lexerrorf:
        errorlog.warning('No t_error rule is defined')

    # Set up eof functions
    lexobj.lexstateeoff = linfo.eoff
    lexobj.lexeoff = linfo.eoff.get('INITIAL', None)

    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get('INITIAL', None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get('INITIAL', '')

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        if outputdir is None:
            # If no output directory is set, the location of the output files
            # is determined according to the following rules:
            #     - If lextab specifies a package, files go into that package directory
            #     - Otherwise, files go in the same directory as the specifying module
            if isinstance(lextab, types.ModuleType):
                srcfile = lextab.__file__
            else:
                if '.' not in lextab:
                    srcfile = ldict['__file__']
                else:
                    parts = lextab.split('.')
                    pkgname = '.'.join(parts[:-1])
                    exec('import %s' % pkgname)
                    srcfile = getattr(sys.modules[pkgname], '__file__', '')
            outputdir = os.path.dirname(srcfile)
        try:
            lexobj.writetab(lextab, outputdir)
        except IOError as e:
            errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e))

    return lexobj
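# A minimal sketch of a user module consumed by lex() above (illustrative
# only; the token names and rules are hypothetical):
#
#     import ply.lex as lex
#
#     tokens = ('NUMBER', 'PLUS')
#
#     t_PLUS   = r'\+'
#     t_ignore = ' \t'
#
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         print("Illegal character %r" % t.value[0])
#         t.lexer.skip(1)
#
#     lexer = lex.lex()    # builds a Lexer from this module's namespace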
# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------
def runmain(lexer=None, data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write('Reading from standard input (type EOF to end):\n')
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write('(%s,%r,%d,%d)\n' % (tok.type, tok.value, tok.lineno, tok.lexpos))
# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------
def TOKEN(r):
    def set_regex(f):
        if hasattr(r, '__call__'):
            f.regex = _get_regex(r)
        else:
            f.regex = r
        return f
    return set_regex

# Alternative spelling of the TOKEN decorator
Token = TOKEN
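# A brief usage sketch (illustrative only): @TOKEN lets a rule's regex be
# built programmatically instead of written in the docstring. The
# 'identifier' pattern and 'reserved' dict below are hypothetical, not
# defined by this module:
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         t.type = reserved.get(t.value, 'ID')    # assumes a 'reserved' dict
#         return t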