# -----------------------------------------------------------------------------
# Copyright (C) 2001-2011,
# David M. Beazley (Dabeaz LLC)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of David Beazley nor Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
__version__    = "3.4"     # Version of PLY
__tabversion__ = "3.2"     # Version of table file used

import re, sys, types, copy, os
# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)
# Extract the code attribute of a function. Different implementations
# are for Python 2/3 compatibility.

if sys.version_info[0] < 3:
    def func_code(f):
        return f.func_code
else:
    def func_code(f):
        return f.__code__
# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')
# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self, message, s):
        self.args = (message,)
        self.text = s
# Token class. This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos)

    def __repr__(self):
        return str(self)
# This object is a stand-in for a logging object created by the
# logging module.
class PlyLogger(object):
    def __init__(self, f):
        self.f = f

    def critical(self, msg, *args, **kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self, msg, *args, **kwargs):
        self.f.write("WARNING: " + (msg % args) + "\n")

    def error(self, msg, *args, **kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info  = critical
    debug = critical
# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self, name):
        return self

    def __call__(self, *args, **kwargs):
        return self
# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
#    input()   -  Store a new string in the lexer
#    token()   -  Get the next token
#    clone()   -  Clone the lexer
#
#    lineno    -  Current line number
#    lexpos    -  Current position in the input string
# -----------------------------------------------------------------------------
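# A minimal usage sketch (illustrative only): a lexer built by lex() below is
# typically driven like this, where the importing module defines `tokens` and
# its t_* rules. The input text and token names here are hypothetical.
#
#     import ply.lex as lex
#     lexer = lex.lex()              # build a lexer from this module's rules
#     lexer.input("3 + 4 * 10")      # store a new string in the lexer
#     while True:
#         tok = lexer.token()        # next LexToken, or None at end of input
#         if not tok:
#             break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)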
class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re, findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = 0          # Optimized mode
    def clone(self, object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            c.lexmodule = object
        return c
    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, tabfile, outputdir=""):
        if isinstance(tabfile, types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir, basetabfilename) + ".py"
        tf = open(filename, "w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile, __version__))
        tf.write("_tabversion   = %s\n" % repr(__tabversion__))
        tf.write("_lextokens    = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags   = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals  = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = {}
        # Collect all functions in the initial state
        initial = self.lexstatere["INITIAL"]
        initialfuncs = []
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],
                              _funcs_to_names(lre[i][1], self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere   = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = {}
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()
    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                env = {}
                exec("import %s as lextab" % tabfile, env, env)
                lextab = env['lextab']

        if getattr(lextab, "_tabversion", "0.0") != __tabversion__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens      = lextab._lextokens
        self.lexreflags     = lextab._lexreflags
        self.lexliterals    = lextab._lexliterals
        self.lexstateinfo   = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere     = {}
        self.lexstateretext = {}
        for key, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0], lextab._lexreflags | re.VERBOSE),
                              _names_to_funcs(lre[i][1], fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = {}
        for key, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')
    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)
    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        if state not in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, "")
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexstate = state
    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())
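    # A hedged sketch of typical use from token rules (the rule and state
    # names here are hypothetical):
    #
    #     def t_LBRACE(t):
    #         r'\{'
    #         t.lexer.push_state('block')    # enter the 'block' state
    #         return t
    #
    #     def t_block_RBRACE(t):
    #         r'\}'
    #         t.lexer.pop_state()            # restore the previous state
    #         return t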
    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n
    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing.
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos    = self.lexpos        # This is here in case user has updated lexpos.
                    lexignore = self.lexignore     # This is here in case there was a state change
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None
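    # A minimal sketch of the kind of rule function that token() calls above
    # (documented PLY form; the NUMBER token is hypothetical):
    #
    #     def t_NUMBER(t):
    #         r'\d+'
    #         t.value = int(t.value)    # rules may rewrite t.value in place
    #         return t                  # returning None discards the token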
# -----------------------------------------------------------------------------
# === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack. This is used to get the environment
# associated with the lex() call if none was provided.
# -----------------------------------------------------------------------------
def get_caller_module_dict(levels):
    try:
        raise RuntimeError
    except RuntimeError:
        e, b, t = sys.exc_info()
        f = t.tb_frame
        while levels > 0:
            f = f.f_back
            levels -= 1
        ldict = f.f_globals.copy()
        if f.f_globals != f.f_locals:
            ldict.update(f.f_locals)
        return ldict
# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------
def _funcs_to_names(funclist, namelist):
    result = []
    for f, name in zip(funclist, namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result
# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------
def _names_to_funcs(namelist, fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]], n[1]))
        else:
            result.append(n)
    return result
# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------
def _form_master_re(relist, reflags, ldict, toknames):
    if not relist:
        return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex, re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        # The expression was too large. Split it in half and retry recursively.
        m = int(len(relist)/2)
        if m == 0:
            m = 1
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return llist+rlist, lre+rre, lnames+rnames
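# For illustration, the master regex is an alternation of named groups, and
# m.lastindex maps the winning group back to its rule. A hypothetical sketch:
#
#     relist = [r"(?P<t_NUMBER>\d+)", r"(?P<t_PLUS>\+)"]
#     master = re.compile("|".join(relist), re.VERBOSE)
#     m = master.match("42")
#     m.lastindex                 # index of the t_NUMBER group -> rule lookup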
# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------
def _statetoken(s, names):
    parts = s.split("_")
    for i in range(1, len(parts)):
        if parts[i] not in names and parts[i] != 'ANY':
            break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states, tokenname)
# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self, ldict, log=None, reflags=0):
        self.ldict      = ldict
        self.error_func = None
        self.tokens     = []
        self.reflags    = reflags
        self.stateinfo  = {'INITIAL': 'inclusive'}
        self.files      = {}
        self.error      = 0

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log
    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error
    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens", None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens
    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1
    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals", "")
    # Validate the literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
                    continue
        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1
    def get_states(self):
        self.states = self.ldict.get("states", None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype
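    # A sketch of the `states` declaration that get_states() reads (the state
    # names are hypothetical; the form is the documented PLY convention):
    #
    #     states = (
    #         ('comment', 'exclusive'),    # rules named t_comment_*
    #         ('block',   'inclusive'),    # also inherits INITIAL rules
    #     )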
    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings

        self.toknames = {}        # Mapping of symbols to token names
        self.funcsym  = {}        # Symbols defined as functions
        self.strsym   = {}        # Symbols defined as strings
        self.ignore   = {}        # Ignore strings by state
        self.errorf   = {}        # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, "__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)
                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1
        # Sort the functions by line number
        for f in self.funcsym.values():
            if sys.version_info[0] < 3:
                f.sort(lambda x, y: cmp(func_code(x[1]).co_firstlineno, func_code(y[1]).co_firstlineno))
            else:
                # Python 3.0
                f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x, y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]), reverse=True)
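        # Why longest-first matters: in an alternation, Python's re module
        # picks the first alternative that matches, not the longest one.
        # Sorting string rules by descending pattern length ensures, for
        # example, that r'==' is tried before r'=' in the master regex.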
    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname, f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
                    if '#' in f.__doc__:
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
                    self.error = 1
            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if tokname not in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name, r), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = 1
            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)
    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the lexer input file. This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------
    def validate_file(self, filename):
        base, ext = os.path.splitext(filename)
        if ext != '.py':
            return      # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return      # Couldn't find the file. Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev)
                    self.error = 1
            linen += 1
# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None, object=None, debug=0, optimize=0, lextab="lextab",
        reflags=0, nowarn=0, outputdir="", debuglog=None, errorlog=None):
    global lexer
    ldict = None
    stateinfo = {'INITIAL': 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token, input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj
        except ImportError:
            pass
    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens   = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states   = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = {}
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo
    regexs = {}

    # Build the regular expression lists for each state
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = func_code(f).co_firstlineno
            file = func_code(f).co_filename
            regex_list.append("(?P<%s>%s)" % (fname, f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, f.__doc__, state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list
    # Form the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, re_text[i])
    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])
    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL", None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")
    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL", None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL", "")
    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab, outputdir)

    return lexobj
# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------
def runmain(lexer=None, data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)

    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok:
            break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos))
# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regular expression of a token
# function when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------
def TOKEN(r):
    def set_doc(f):
        if hasattr(r, "__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
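# A minimal usage sketch (documented PLY form; the identifier pattern and the
# ID token are hypothetical):
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t    # t_ID now carries `identifier` as its docstring/regex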