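# Provenance (left over from the repository web view this file was copied from):
#   commit subject: Sync translations from Transifex and run lupdate
#   path: [qBittorrent.git] / src / searchengine / nova3 / sgmllib3.py
#   blob: 88a02a307f4013abccb867cac97075e22acf06a5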
"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.
11 import _markupbase
12 import re
14 __all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing.
#
# All of these are applied with an explicit start offset into the parser's
# raw buffer (``pattern.match(rawdata, i)`` / ``pattern.search(rawdata, i)``).

# Next character that can start markup or a reference.
interesting = re.compile('[&<]')

# A prefix of a reference, start tag, end tag or declaration that may still
# be completed by data that has not been fed yet.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Complete references: the name/number (group 1) followed by one terminating
# character, which need not be ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# Start-tag recognisers, including the SGML short forms '<>' and '<tag/data/'.
starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# Terminators and tag/attribute tokenisers.
piclose = re.compile('>')
endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# group 1 = attribute name, group 2 = '= value' part (optional),
# group 3 = the value, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
class SGMLParser(_markupbase.ParserBase):
    """Incremental SGML parser that dispatches to handler methods.

    Data is supplied with feed() and finished with close().  For every
    start tag ``<foo>`` the parser looks for a ``start_foo`` method (an
    element that is pushed on the open-element stack) or, failing that, a
    ``do_foo`` method (an element with no end tag); ``</foo>`` is routed to
    ``end_foo``.  Tag names are lower-cased before the lookup.  Anything
    without a matching method goes to the ``unknown_*`` hooks, and text goes
    to handle_data(), possibly split into arbitrary chunks.
    """

    # Matches a reference inside an attribute value: group 1 is an entity
    # name, group 2 a decimal character number, group 3 the optional ';'.
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet consumed
        self.stack = []         # names of open start_xxx elements
        self.lasttag = '???'    # last start tag seen, used by '<>'
        self.nomoretags = 0     # true: everything up to EOF is plain data
        self.literal = 0        # true: only an end tag is treated as markup
        _markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything up to the next '&' or '<' is plain character data.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an end index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumes one terminator character; give
                    # it back unless it was the closing ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever could not be consumed for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the source text recorded for the last start tag, or None."""
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag,
    # or -1 if the tag is not terminated yet
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Bare attribute: the value defaults to the name.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # The tag may have been cut short by '<'; only consume a real '>'.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    def _convert_ref(self, match):
        if match.group(2):
            # Character reference; unconvertible ones are put back verbatim.
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            # Entity reference terminated by ';'.
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without ';' is left untouched.
            return '&%s' % match.group(1)

    # Internal -- parse endtag, return the index just past the tag,
    # or -1 if the tag is not terminated yet
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only start_xxx elements are tracked for end-tag matching.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag: close the most recently opened element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                if hasattr(self, 'end_' + tag):
                    self.report_unbalanced(tag)
                else:
                    self.unknown_endtag(tag)
                return
            # The last match wins, i.e. the innermost open element.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close every element opened at or after the matched one.
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack:', self.stack)

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not an integer in the range 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for a name that is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_entityref(self, ref):
        pass
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that prints every parse event.

    Character data is buffered in ``self.testdata`` and printed by flush(),
    which every other handler calls first so that output keeps the order of
    the input.
    """

    def __init__(self, verbose=0):
        # Must exist before SGMLParser.__init__() runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*, flushing once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data:', repr(data))

    def handle_comment(self, data):
        """Print a comment, eliding the middle of a long one."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment:', r)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            print('start tag: <' + tag, end=' ')
            for name, value in attrs:
                print(name + '=' + '"' + value + '"', end=' ')
            print('>')

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference that could not be converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print an unrecognised declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print any character data still buffered."""
        SGMLParser.close(self)
        self.flush()
def test(args=None):
    """Parse a file and print the events, as a command-line self test.

    *args* defaults to ``sys.argv[1:]``.  A leading ``-s`` selects the
    silent SGMLParser instead of TestSGMLParser.  The next argument is the
    file to read (default ``test.html``); ``-`` means standard input.  If
    the file cannot be opened, the error is printed and the process exits
    with status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        file = args[0]
    else:
        file = 'test.html'

    if file == '-':
        # Standard input is read but deliberately left open.
        data = sys.stdin.read()
    else:
        try:
            f = open(file, 'r')
        except OSError as msg:  # IOError is an alias of OSError in Python 3
            print(file, ":", msg)
            sys.exit(1)
        # Close the file even if read() raises.
        with f:
            data = f.read()

    x = klass()
    # Feed one character at a time to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()
# Run the self test when executed as a script.
if __name__ == '__main__':
    test()