Better example and comments.
[rox-lib.git] / python / rox / Plex / Regexps.py
blob3dc987080fd1a0e33994878bc1a12ec8fa53d775
1 #=======================================================================
3 # Python Lexical Analyser
5 # Regular Expressions
7 #=======================================================================
9 import array
10 import string
11 import types
12 from sys import maxint
14 import Errors
17 # Constants
20 BOL = 'bol'
21 EOL = 'eol'
22 EOF = 'eof'
24 nl_code = ord('\n')
27 # Helper functions
30 def chars_to_ranges(s):
31 """
32 Return a list of character codes consisting of pairs
33 [code1a, code1b, code2a, code2b,...] which cover all
34 the characters in |s|.
35 """
36 char_list = list(s)
37 char_list.sort()
38 i = 0
39 n = len(char_list)
40 result = []
41 while i < n:
42 code1 = ord(char_list[i])
43 code2 = code1 + 1
44 i = i + 1
45 while i < n and code2 >= ord(char_list[i]):
46 code2 = code2 + 1
47 i = i + 1
48 result.append(code1)
49 result.append(code2)
50 return result
52 def uppercase_range(code1, code2):
53 """
54 If the range of characters from code1 to code2-1 includes any
55 lower case letters, return the corresponding upper case range.
56 """
57 code3 = max(code1, ord('a'))
58 code4 = min(code2, ord('z') + 1)
59 if code3 < code4:
60 d = ord('A') - ord('a')
61 return (code3 + d, code4 + d)
62 else:
63 return None
65 def lowercase_range(code1, code2):
66 """
67 If the range of characters from code1 to code2-1 includes any
68 upper case letters, return the corresponding lower case range.
69 """
70 code3 = max(code1, ord('A'))
71 code4 = min(code2, ord('Z') + 1)
72 if code3 < code4:
73 d = ord('a') - ord('A')
74 return (code3 + d, code4 + d)
75 else:
76 return None
78 def CodeRanges(code_list):
79 """
80 Given a list of codes as returned by chars_to_ranges, return
81 an RE which will match a character in any of the ranges.
82 """
83 re_list = []
84 for i in xrange(0, len(code_list), 2):
85 re_list.append(CodeRange(code_list[i], code_list[i + 1]))
86 return apply(Alt, tuple(re_list))
88 def CodeRange(code1, code2):
89 """
90 CodeRange(code1, code2) is an RE which matches any character
91 with a code |c| in the range |code1| <= |c| < |code2|.
92 """
93 if code1 <= nl_code < code2:
94 return Alt(RawCodeRange(code1, nl_code),
95 RawNewline,
96 RawCodeRange(nl_code + 1, code2))
97 else:
98 return RawCodeRange(code1, code2)
101 # Abstract classes
104 class RE:
105 """RE is the base class for regular expression constructors.
106 The following operators are defined on REs:
108 re1 + re2 is an RE which matches |re1| followed by |re2|
109 re1 | re2 is an RE which matches either |re1| or |re2|
112 nullable = 1 # True if this RE can match 0 input symbols
113 match_nl = 1 # True if this RE can match a string ending with '\n'
114 str = None # Set to a string to override the class's __str__ result
116 def build_machine(self, machine, initial_state, final_state,
117 match_bol, nocase):
119 This method should add states to |machine| to implement this
120 RE, starting at |initial_state| and ending at |final_state|.
121 If |match_bol| is true, the RE must be able to match at the
122 beginning of a line. If nocase is true, upper and lower case
123 letters should be treated as equivalent.
125 raise exceptions.UnimplementedMethod("%s.build_machine not implemented" %
126 self.__class__.__name__)
128 def build_opt(self, m, initial_state, c):
130 Given a state |s| of machine |m|, return a new state
131 reachable from |s| on character |c| or epsilon.
133 s = m.new_state()
134 initial_state.link_to(s)
135 initial_state.add_transition(c, s)
136 return s
138 def __add__(self, other):
139 return Seq(self, other)
141 def __or__(self, other):
142 return Alt(self, other)
144 def __str__(self):
145 if self.str:
146 return self.str
147 else:
148 return self.calc_str()
150 def check_re(self, num, value):
151 if not isinstance(value, RE):
152 self.wrong_type(num, value, "Plex.RE instance")
154 def check_string(self, num, value):
155 if type(value) <> type(''):
156 self.wrong_type(num, value, "string")
158 def check_char(self, num, value):
159 self.check_string(num, value)
160 if len(value) <> 1:
161 raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
162 "Expected a string of length 1, got: %s" % (
163 num, self.__class__.__name__, repr(value)))
165 def wrong_type(self, num, value, expected):
166 if type(value) == types.InstanceType:
167 got = "%s.%s instance" % (
168 value.__class__.__module__, value.__class__.__name__)
169 else:
170 got = type(value).__name__
171 raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
172 "(expected %s, got %s" % (
173 num, self.__class__.__name__, expected, got))
176 # Primitive RE constructors
177 # -------------------------
179 # These are the basic REs from which all others are built.
182 ## class Char(RE):
183 ## """
184 ## Char(c) is an RE which matches the character |c|.
185 ## """
187 ## nullable = 0
189 ## def __init__(self, char):
190 ## self.char = char
191 ## self.match_nl = char == '\n'
193 ## def build_machine(self, m, initial_state, final_state, match_bol, nocase):
194 ## c = self.char
195 ## if match_bol and c <> BOL:
196 ## s1 = self.build_opt(m, initial_state, BOL)
197 ## else:
198 ## s1 = initial_state
199 ## if c == '\n' or c == EOF:
200 ## s1 = self.build_opt(m, s1, EOL)
201 ## if len(c) == 1:
202 ## code = ord(self.char)
203 ## s1.add_transition((code, code+1), final_state)
204 ## if nocase and is_letter_code(code):
205 ## code2 = other_case_code(code)
206 ## s1.add_transition((code2, code2+1), final_state)
207 ## else:
208 ## s1.add_transition(c, final_state)
210 ## def calc_str(self):
211 ## return "Char(%s)" % repr(self.char)
213 def Char(c):
215 Char(c) is an RE which matches the character |c|.
217 if len(c) == 1:
218 result = CodeRange(ord(c), ord(c) + 1)
219 else:
220 result = SpecialSymbol(c)
221 result.str = "Char(%s)" % repr(c)
222 return result
224 class RawCodeRange(RE):
226 RawCodeRange(code1, code2) is a low-level RE which matches any character
227 with a code |c| in the range |code1| <= |c| < |code2|, where the range
228 does not include newline. For internal use only.
230 nullable = 0
231 match_nl = 0
232 range = None # (code, code)
233 uppercase_range = None # (code, code) or None
234 lowercase_range = None # (code, code) or None
236 def __init__(self, code1, code2):
237 self.range = (code1, code2)
238 self.uppercase_range = uppercase_range(code1, code2)
239 self.lowercase_range = lowercase_range(code1, code2)
241 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
242 if match_bol:
243 initial_state = self.build_opt(m, initial_state, BOL)
244 initial_state.add_transition(self.range, final_state)
245 if nocase:
246 if self.uppercase_range:
247 initial_state.add_transition(self.uppercase_range, final_state)
248 if self.lowercase_range:
249 initial_state.add_transition(self.lowercase_range, final_state)
251 def calc_str(self):
252 return "CodeRange(%d,%d)" % (self.code1, self.code2)
254 class _RawNewline(RE):
256 RawNewline is a low-level RE which matches a newline character.
257 For internal use only.
259 nullable = 0
260 match_nl = 1
262 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
263 s = self.build_opt(m, initial_state, EOL)
264 s.add_transition((nl_code, nl_code + 1), final_state)
266 RawNewline = _RawNewline()
269 class SpecialSymbol(RE):
271 SpecialSymbol(sym) is an RE which matches the special input
272 symbol |sym|, which is one of BOL, EOL or EOF.
274 nullable = 0
275 match_nl = 0
276 sym = None
278 def __init__(self, sym):
279 self.sym = sym
281 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
282 # Sequences 'bol bol' and 'bol eof' are impossible, so only need
283 # to allow for bol if sym is eol
284 if match_bol and self.sym == EOL:
285 initial_state = self.build_opt(m, initial_state, BOL)
286 initial_state.add_transition(self.sym, final_state)
289 class Seq(RE):
290 """Seq(re1, re2, re3...) is an RE which matches |re1| followed by
291 |re2| followed by |re3|..."""
293 def __init__(self, *re_list):
294 nullable = 1
295 for i in xrange(len(re_list)):
296 re = re_list[i]
297 self.check_re(i, re)
298 nullable = nullable and re.nullable
299 self.re_list = re_list
300 self.nullable = nullable
301 i = len(re_list)
302 match_nl = 0
303 while i:
304 i = i - 1
305 re = re_list[i]
306 if re.match_nl:
307 match_nl = 1
308 break
309 if not re.nullable:
310 break
311 self.match_nl = match_nl
313 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
314 re_list = self.re_list
315 if len(re_list) == 0:
316 initial_state.link_to(final_state)
317 else:
318 s1 = initial_state
319 n = len(re_list)
320 for i in xrange(n):
321 if i < n - 1:
322 s2 = m.new_state()
323 else:
324 s2 = final_state
325 re = re_list[i]
326 re.build_machine(m, s1, s2, match_bol, nocase)
327 s1 = s2
328 match_bol = re.match_nl or (match_bol and re.nullable)
330 def calc_str(self):
331 return "Seq(%s)" % string.join(map(str, self.re_list), ",")
334 class Alt(RE):
335 """Alt(re1, re2, re3...) is an RE which matches either |re1| or
336 |re2| or |re3|..."""
338 def __init__(self, *re_list):
339 self.re_list = re_list
340 nullable = 0
341 match_nl = 0
342 nullable_res = []
343 non_nullable_res = []
344 i = 1
345 for re in re_list:
346 self.check_re(i, re)
347 if re.nullable:
348 nullable_res.append(re)
349 nullable = 1
350 else:
351 non_nullable_res.append(re)
352 if re.match_nl:
353 match_nl = 1
354 i = i + 1
355 self.nullable_res = nullable_res
356 self.non_nullable_res = non_nullable_res
357 self.nullable = nullable
358 self.match_nl = match_nl
360 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
361 for re in self.nullable_res:
362 re.build_machine(m, initial_state, final_state, match_bol, nocase)
363 if self.non_nullable_res:
364 if match_bol:
365 initial_state = self.build_opt(m, initial_state, BOL)
366 for re in self.non_nullable_res:
367 re.build_machine(m, initial_state, final_state, 0, nocase)
369 def calc_str(self):
370 return "Alt(%s)" % string.join(map(str, self.re_list), ",")
373 class Rep1(RE):
374 """Rep1(re) is an RE which matches one or more repetitions of |re|."""
376 def __init__(self, re):
377 self.check_re(1, re)
378 self.re = re
379 self.nullable = re.nullable
380 self.match_nl = re.match_nl
382 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
383 s1 = m.new_state()
384 s2 = m.new_state()
385 initial_state.link_to(s1)
386 self.re.build_machine(m, s1, s2, match_bol, nocase)
387 s2.link_to(s1)
388 s2.link_to(final_state)
390 def calc_str(self):
391 return "Rep1(%s)" % self.re
394 class SwitchCase(RE):
396 SwitchCase(re, nocase) is an RE which matches the same strings as RE,
397 but treating upper and lower case letters according to |nocase|. If
398 |nocase| is true, case is ignored, otherwise it is not.
400 re = None
401 nocase = None
403 def __init__(self, re, nocase):
404 self.re = re
405 self.nocase = nocase
406 self.nullable = re.nullable
407 self.match_nl = re.match_nl
409 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
410 self.re.build_machine(m, initial_state, final_state, match_bol,
411 self.nocase)
413 def calc_str(self):
414 if self.nocase:
415 name = "NoCase"
416 else:
417 name = "Case"
418 return "%s(%s)" % (name, self.re)
421 # Composite RE constructors
422 # -------------------------
424 # These REs are defined in terms of the primitive REs.
427 Empty = Seq()
428 Empty.__doc__ = \
430 Empty is an RE which matches the empty string.
432 Empty.str = "Empty"
434 def Str1(s):
436 Str1(s) is an RE which matches the literal string |s|.
438 result = apply(Seq, tuple(map(Char, s)))
439 result.str = "Str(%s)" % repr(s)
440 return result
442 def Str(*strs):
444 Str(s) is an RE which matches the literal string |s|.
445 Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
447 if len(strs) == 1:
448 return Str1(strs[0])
449 else:
450 result = apply(Alt, tuple(map(Str1, strs)))
451 result.str = "Str(%s)" % string.join(map(repr, strs), ",")
452 return result
454 def Any(s):
456 Any(s) is an RE which matches any character in the string |s|.
458 #result = apply(Alt, tuple(map(Char, s)))
459 result = CodeRanges(chars_to_ranges(s))
460 result.str = "Any(%s)" % repr(s)
461 return result
463 def AnyBut(s):
465 AnyBut(s) is an RE which matches any character (including
466 newline) which is not in the string |s|.
468 ranges = chars_to_ranges(s)
469 ranges.insert(0, -maxint)
470 ranges.append(maxint)
471 result = CodeRanges(ranges)
472 result.str = "AnyBut(%s)" % repr(s)
473 return result
475 AnyChar = AnyBut("")
476 AnyChar.__doc__ = \
478 AnyChar is an RE which matches any single character (including a newline).
480 AnyChar.str = "AnyChar"
482 def Range(s1, s2 = None):
484 Range(c1, c2) is an RE which matches any single character in the range
485 |c1| to |c2| inclusive.
486 Range(s) where |s| is a string of even length is an RE which matches
487 any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
489 if s2:
490 result = CodeRange(ord(s1), ord(s2) + 1)
491 result.str = "Range(%s,%s)" % (s1, s2)
492 else:
493 ranges = []
494 for i in range(0, len(s1), 2):
495 ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1))
496 result = apply(Alt, tuple(ranges))
497 result.str = "Range(%s)" % repr(s1)
498 return result
500 def Opt(re):
502 Opt(re) is an RE which matches either |re| or the empty string.
504 result = Alt(re, Empty)
505 result.str = "Opt(%s)" % re
506 return result
508 def Rep(re):
510 Rep(re) is an RE which matches zero or more repetitions of |re|.
512 result = Opt(Rep1(re))
513 result.str = "Rep(%s)" % re
514 return result
516 def NoCase(re):
518 NoCase(re) is an RE which matches the same strings as RE, but treating
519 upper and lower case letters as equivalent.
521 return SwitchCase(re, nocase = 1)
523 def Case(re):
525 Case(re) is an RE which matches the same strings as RE, but treating
526 upper and lower case letters as distinct, i.e. it cancels the effect
527 of any enclosing NoCase().
529 return SwitchCase(re, nocase = 0)
532 # RE Constants
535 Bol = Char(BOL)
536 Bol.__doc__ = \
538 Bol is an RE which matches the beginning of a line.
540 Bol.str = "Bol"
542 Eol = Char(EOL)
543 Eol.__doc__ = \
545 Eol is an RE which matches the end of a line.
547 Eol.str = "Eol"
549 Eof = Char(EOF)
550 Eof.__doc__ = \
552 Eof is an RE which matches the end of the file.
554 Eof.str = "Eof"