1 #=======================================================================
3 # Python Lexical Analyser
7 #=======================================================================
10 from sys
import maxint
as maxint
28 def chars_to_ranges(s
):
30 Return a list of character codes consisting of pairs
31 [code1a, code1b, code2a, code2b,...] which cover all
32 the characters in |s|.
40 code1
= ord(char_list
[i
])
43 while i
< n
and code2
>= ord(char_list
[i
]):
50 def uppercase_range(code1
, code2
):
52 If the range of characters from code1 to code2-1 includes any
53 lower case letters, return the corresponding upper case range.
55 code3
= max(code1
, ord('a'))
56 code4
= min(code2
, ord('z') + 1)
58 d
= ord('A') - ord('a')
59 return (code3
+ d
, code4
+ d
)
63 def lowercase_range(code1
, code2
):
65 If the range of characters from code1 to code2-1 includes any
66 upper case letters, return the corresponding lower case range.
68 code3
= max(code1
, ord('A'))
69 code4
= min(code2
, ord('Z') + 1)
71 d
= ord('a') - ord('A')
72 return (code3
+ d
, code4
+ d
)
76 def CodeRanges(code_list
):
78 Given a list of codes as returned by chars_to_ranges, return
79 an RE which will match a character in any of the ranges.
82 for i
in xrange(0, len(code_list
), 2):
83 re_list
.append(CodeRange(code_list
[i
], code_list
[i
+ 1]))
86 def CodeRange(code1
, code2
):
88 CodeRange(code1, code2) is an RE which matches any character
89 with a code |c| in the range |code1| <= |c| < |code2|.
91 if code1
<= nl_code
< code2
:
92 return Alt(RawCodeRange(code1
, nl_code
),
94 RawCodeRange(nl_code
+ 1, code2
))
96 return RawCodeRange(code1
, code2
)
103 """RE is the base class for regular expression constructors.
104 The following operators are defined on REs:
106 re1 + re2 is an RE which matches |re1| followed by |re2|
107 re1 | re2 is an RE which matches either |re1| or |re2|
110 nullable
= 1 # True if this RE can match 0 input symbols
111 match_nl
= 1 # True if this RE can match a string ending with '\n'
112 str = None # Set to a string to override the class's __str__ result
114 def build_machine(self
, machine
, initial_state
, final_state
,
117 This method should add states to |machine| to implement this
118 RE, starting at |initial_state| and ending at |final_state|.
119 If |match_bol| is true, the RE must be able to match at the
120 beginning of a line. If nocase is true, upper and lower case
121 letters should be treated as equivalent.
123 raise NotImplementedError("%s.build_machine not implemented" %
124 self
.__class
__.__name
__)
126 def build_opt(self
, m
, initial_state
, c
):
128 Given a state |s| of machine |m|, return a new state
129 reachable from |s| on character |c| or epsilon.
132 initial_state
.link_to(s
)
133 initial_state
.add_transition(c
, s
)
136 def __add__(self
, other
):
137 return Seq(self
, other
)
139 def __or__(self
, other
):
140 return Alt(self
, other
)
146 return self
.calc_str()
148 def check_re(self
, num
, value
):
149 if not isinstance(value
, RE
):
150 self
.wrong_type(num
, value
, "Plex.RE instance")
152 def check_string(self
, num
, value
):
153 if type(value
) != type(''):
154 self
.wrong_type(num
, value
, "string")
156 def check_char(self
, num
, value
):
157 self
.check_string(num
, value
)
159 raise Errors
.PlexValueError("Invalid value for argument %d of Plex.%s."
160 "Expected a string of length 1, got: %s" % (
161 num
, self
.__class
__.__name
__, repr(value
)))
163 def wrong_type(self
, num
, value
, expected
):
164 if type(value
) == types
.InstanceType
:
165 got
= "%s.%s instance" % (
166 value
.__class
__.__module
__, value
.__class
__.__name
__)
168 got
= type(value
).__name
__
169 raise Errors
.PlexTypeError("Invalid type for argument %d of Plex.%s "
170 "(expected %s, got %s" % (
171 num
, self
.__class
__.__name
__, expected
, got
))
174 # Primitive RE constructors
175 # -------------------------
177 # These are the basic REs from which all others are built.
182 ## Char(c) is an RE which matches the character |c|.
187 ## def __init__(self, char):
189 ## self.match_nl = char == '\n'
191 ## def build_machine(self, m, initial_state, final_state, match_bol, nocase):
193 ## if match_bol and c != BOL:
194 ## s1 = self.build_opt(m, initial_state, BOL)
196 ## s1 = initial_state
197 ## if c == '\n' or c == EOF:
198 ## s1 = self.build_opt(m, s1, EOL)
200 ## code = ord(self.char)
201 ## s1.add_transition((code, code+1), final_state)
202 ## if nocase and is_letter_code(code):
203 ## code2 = other_case_code(code)
204 ## s1.add_transition((code2, code2+1), final_state)
206 ## s1.add_transition(c, final_state)
208 ## def calc_str(self):
209 ## return "Char(%s)" % repr(self.char)
213 Char(c) is an RE which matches the character |c|.
216 result
= CodeRange(ord(c
), ord(c
) + 1)
218 result
= SpecialSymbol(c
)
219 result
.str = "Char(%s)" % repr(c
)
222 class RawCodeRange(RE
):
224 RawCodeRange(code1, code2) is a low-level RE which matches any character
225 with a code |c| in the range |code1| <= |c| < |code2|, where the range
226 does not include newline. For internal use only.
230 range = None # (code, code)
231 uppercase_range
= None # (code, code) or None
232 lowercase_range
= None # (code, code) or None
234 def __init__(self
, code1
, code2
):
235 self
.range = (code1
, code2
)
236 self
.uppercase_range
= uppercase_range(code1
, code2
)
237 self
.lowercase_range
= lowercase_range(code1
, code2
)
239 def build_machine(self
, m
, initial_state
, final_state
, match_bol
, nocase
):
241 initial_state
= self
.build_opt(m
, initial_state
, BOL
)
242 initial_state
.add_transition(self
.range, final_state
)
244 if self
.uppercase_range
:
245 initial_state
.add_transition(self
.uppercase_range
, final_state
)
246 if self
.lowercase_range
:
247 initial_state
.add_transition(self
.lowercase_range
, final_state
)
250 return "CodeRange(%d,%d)" % (self
.code1
, self
.code2
)
252 class _RawNewline(RE
):
254 RawNewline is a low-level RE which matches a newline character.
255 For internal use only.
260 def build_machine(self
, m
, initial_state
, final_state
, match_bol
, nocase
):
262 initial_state
= self
.build_opt(m
, initial_state
, BOL
)
263 s
= self
.build_opt(m
, initial_state
, EOL
)
264 s
.add_transition((nl_code
, nl_code
+ 1), final_state
)
266 RawNewline
= _RawNewline()
269 class SpecialSymbol(RE
):
271 SpecialSymbol(sym) is an RE which matches the special input
272 symbol |sym|, which is one of BOL, EOL or EOF.
278 def __init__(self
, sym
):
281 def build_machine(self
, m
, initial_state
, final_state
, match_bol
, nocase
):
282 # Sequences 'bol bol' and 'bol eof' are impossible, so only need
283 # to allow for bol if sym is eol
284 if match_bol
and self
.sym
== EOL
:
285 initial_state
= self
.build_opt(m
, initial_state
, BOL
)
286 initial_state
.add_transition(self
.sym
, final_state
)
290 """Seq(re1, re2, re3...) is an RE which matches |re1| followed by
291 |re2| followed by |re3|..."""
293 def __init__(self
, *re_list
):
295 for i
in xrange(len(re_list
)):
298 nullable
= nullable
and re
.nullable
299 self
.re_list
= re_list
300 self
.nullable
= nullable
311 self
.match_nl
= match_nl
313 def build_machine(self
, m
, initial_state
, final_state
, match_bol
, nocase
):
314 re_list
= self
.re_list
315 if len(re_list
) == 0:
316 initial_state
.link_to(final_state
)
326 re
.build_machine(m
, s1
, s2
, match_bol
, nocase
)
328 match_bol
= re
.match_nl
or (match_bol
and re
.nullable
)
331 return "Seq(%s)" % ','.join(map(str, self
.re_list
))
335 """Alt(re1, re2, re3...) is an RE which matches either |re1| or
338 def __init__(self
, *re_list
):
339 self
.re_list
= re_list
343 non_nullable_res
= []
348 nullable_res
.append(re
)
351 non_nullable_res
.append(re
)
355 self
.nullable_res
= nullable_res
356 self
.non_nullable_res
= non_nullable_res
357 self
.nullable
= nullable
358 self
.match_nl
= match_nl
360 def build_machine(self
, m
, initial_state
, final_state
, match_bol
, nocase
):
361 for re
in self
.nullable_res
:
362 re
.build_machine(m
, initial_state
, final_state
, match_bol
, nocase
)
363 if self
.non_nullable_res
:
365 initial_state
= self
.build_opt(m
, initial_state
, BOL
)
366 for re
in self
.non_nullable_res
:
367 re
.build_machine(m
, initial_state
, final_state
, 0, nocase
)
370 return "Alt(%s)" % ','.join(map(str, self
.re_list
))
374 """Rep1(re) is an RE which matches one or more repetitions of |re|."""
376 def __init__(self
, re
):
379 self
.nullable
= re
.nullable
380 self
.match_nl
= re
.match_nl
382 def build_machine(self
, m
, initial_state
, final_state
, match_bol
, nocase
):
385 initial_state
.link_to(s1
)
386 self
.re
.build_machine(m
, s1
, s2
, match_bol
or self
.re
.match_nl
, nocase
)
388 s2
.link_to(final_state
)
391 return "Rep1(%s)" % self
.re
394 class SwitchCase(RE
):
396 SwitchCase(re, nocase) is an RE which matches the same strings as RE,
397 but treating upper and lower case letters according to |nocase|. If
398 |nocase| is true, case is ignored, otherwise it is not.
403 def __init__(self
, re
, nocase
):
406 self
.nullable
= re
.nullable
407 self
.match_nl
= re
.match_nl
409 def build_machine(self
, m
, initial_state
, final_state
, match_bol
, nocase
):
410 self
.re
.build_machine(m
, initial_state
, final_state
, match_bol
,
418 return "%s(%s)" % (name
, self
.re
)
421 # Composite RE constructors
422 # -------------------------
424 # These REs are defined in terms of the primitive REs.
430 Empty is an RE which matches the empty string.
436 Str1(s) is an RE which matches the literal string |s|.
438 result
= Seq(*tuple(map(Char
, s
)))
439 result
.str = "Str(%s)" % repr(s
)
444 Str(s) is an RE which matches the literal string |s|.
445 Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
450 result
= Alt(*tuple(map(Str1
, strs
)))
451 result
.str = "Str(%s)" % ','.join(map(repr, strs
))
456 Any(s) is an RE which matches any character in the string |s|.
458 #result = apply(Alt, tuple(map(Char, s)))
459 result
= CodeRanges(chars_to_ranges(s
))
460 result
.str = "Any(%s)" % repr(s
)
465 AnyBut(s) is an RE which matches any character (including
466 newline) which is not in the string |s|.
468 ranges
= chars_to_ranges(s
)
469 ranges
.insert(0, -maxint
)
470 ranges
.append(maxint
)
471 result
= CodeRanges(ranges
)
472 result
.str = "AnyBut(%s)" % repr(s
)
478 AnyChar is an RE which matches any single character (including a newline).
480 AnyChar
.str = "AnyChar"
482 def Range(s1
, s2
= None):
484 Range(c1, c2) is an RE which matches any single character in the range
485 |c1| to |c2| inclusive.
486 Range(s) where |s| is a string of even length is an RE which matches
487 any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
490 result
= CodeRange(ord(s1
), ord(s2
) + 1)
491 result
.str = "Range(%s,%s)" % (s1
, s2
)
494 for i
in range(0, len(s1
), 2):
495 ranges
.append(CodeRange(ord(s1
[i
]), ord(s1
[i
+1]) + 1))
496 result
= Alt(*ranges
)
497 result
.str = "Range(%s)" % repr(s1
)
502 Opt(re) is an RE which matches either |re| or the empty string.
504 result
= Alt(re
, Empty
)
505 result
.str = "Opt(%s)" % re
510 Rep(re) is an RE which matches zero or more repetitions of |re|.
512 result
= Opt(Rep1(re
))
513 result
.str = "Rep(%s)" % re
518 NoCase(re) is an RE which matches the same strings as RE, but treating
519 upper and lower case letters as equivalent.
521 return SwitchCase(re
, nocase
= 1)
525 Case(re) is an RE which matches the same strings as RE, but treating
526 upper and lower case letters as distinct, i.e. it cancels the effect
527 of any enclosing NoCase().
529 return SwitchCase(re
, nocase
= 0)
538 Bol is an RE which matches the beginning of a line.
545 Eol is an RE which matches the end of a line.
552 Eof is an RE which matches the end of the file.