Fullscreen support, UI fixes, reset improved
[smpy-maemo.git] / mechanize / _html.py
blob2d562c98bf56b41d792b27266bea76799f75c48c
"""HTML handling.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""
11 import re, copy, htmlentitydefs
12 import sgmllib, HTMLParser, ClientForm
14 import _request
15 from _headersutil import split_header_words, is_html as _is_html
16 import _rfc3986
18 DEFAULT_ENCODING = "latin-1"
# The base class is kept purely for backwards compatibility.
class ParseError(ClientForm.ParseError):
    """Raised by the factories in this module when parsing a document fails."""
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    Calling an instance returns a new generator over the items of the
    wrapped iterable.  Items are pulled from the underlying iterator at most
    once and remembered, so every generator yields the full sequence from
    the start, even when several generators are consumed in an interleaved
    fashion.
    """

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        cache = self._cache
        position = 0
        while True:
            if position >= len(cache):
                # Nothing cached at this position yet: pull exactly one more
                # item from the shared iterator.  Re-checking the cache on
                # every step (rather than switching permanently to the
                # iterator) is what keeps interleaved generators from
                # skipping items fetched by one another.  for/else is used
                # instead of next() so this works on any Python version.
                for item in self._iterator:
                    cache.append(item)
                    break
                else:
                    # underlying iterator is exhausted
                    return
            yield cache[position]
            position += 1
class EncodingFinder:
    """Determines the character encoding of a response from its headers."""

    def __init__(self, default_encoding):
        # encoding returned when no Content-Type header names a charset
        self._default_encoding = default_encoding

    def encoding(self, response):
        """Return the charset named by response's Content-Type headers.

        response must provide .info().getheaders(name).  The first charset
        parameter found is returned; if there is none, the default encoding
        passed to the constructor is returned.
        """
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            words = split_header_words([ct])
            if not words:
                # Nothing parsed out of this header value (presumably an
                # empty Content-Type); skip it rather than fail on [0].
                continue
            for k, v in words[0]:
                if k == "charset":
                    return v
        return self._default_encoding
class ResponseTypeFinder:
    """Decides whether a response holds an HTML document."""

    def __init__(self, allow_xhtml):
        # forwarded unchanged to _is_html() on every check
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        """Return the result of _is_html() for response.

        The decision is based on the response's Content-Type headers and its
        URL.  The encoding argument is accepted but not used yet.
        """
        content_types = response.info().getheaders("content-type")
        location = response.geturl()
        # XXX encoding
        return _is_html(content_types, location, self._allow_xhtml)
# idea for this argument-processing trick is from Peter Otten
class Args:
    """Attribute-style view onto a dictionary of named arguments.

    Reading ``args.name`` looks the name up in ``args.dictionary`` (falling
    back to class attributes).  Assigning ``args.name = value`` stores the
    value in ``args.dictionary``, so that code which later passes
    ``**args.dictionary`` on sees the updated value.
    """

    def __init__(self, args_map):
        # Go through __dict__ directly: __setattr__ routes ordinary
        # assignments into the dictionary that is being created here.
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        if key == "dictionary":
            # Only reached when the instance has no dictionary (e.g. it was
            # created without running __init__); looking it up through
            # self.dictionary below would recurse forever.
            raise AttributeError(key)
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        if key == "dictionary":
            # replacing the whole mapping stays possible
            self.__dict__[key] = value
        else:
            self.dictionary[key] = value
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle the form-parsing arguments into an Args object.

    The returned object's .dictionary maps each parameter name of this
    function to the value that was passed (or its default).
    """
    return Args({
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        })
class Link:
    """A link found in a document.

    Attributes: base_url, url (as written in the document), absolute_url
    (url joined onto base_url), text, tag and attrs.
    """

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        # Equality-style comparison only: 0 when url, text, tag and attrs
        # all match, -1 otherwise (including when an attribute is missing).
        try:
            for name in ("url", "text", "tag", "attrs"):
                mine = getattr(self, name)
                theirs = getattr(other, name)
                if mine != theirs:
                    return -1
        except AttributeError:
            return -1
        return 0

    def __repr__(self):
        template = "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)"
        fields = (self.base_url, self.url, self.text, self.tag, self.attrs)
        return template % fields
104 class LinksFactory:
106 def __init__(self,
107 link_parser_class=None,
108 link_class=Link,
109 urltags=None,
111 import _pullparser
112 if link_parser_class is None:
113 link_parser_class = _pullparser.TolerantPullParser
114 self.link_parser_class = link_parser_class
115 self.link_class = link_class
116 if urltags is None:
117 urltags = {
118 "a": "href",
119 "area": "href",
120 "frame": "src",
121 "iframe": "src",
123 self.urltags = urltags
124 self._response = None
125 self._encoding = None
127 def set_response(self, response, base_url, encoding):
128 self._response = response
129 self._encoding = encoding
130 self._base_url = base_url
132 def links(self):
133 """Return an iterator that provides links of the document."""
134 response = self._response
135 encoding = self._encoding
136 base_url = self._base_url
137 p = self.link_parser_class(response, encoding=encoding)
139 try:
140 for token in p.tags(*(self.urltags.keys()+["base"])):
141 if token.type == "endtag":
142 continue
143 if token.data == "base":
144 base_href = dict(token.attrs).get("href")
145 if base_href is not None:
146 base_url = base_href
147 continue
148 attrs = dict(token.attrs)
149 tag = token.data
150 name = attrs.get("name")
151 text = None
152 # XXX use attr_encoding for ref'd doc if that doc does not
153 # provide one by other means
154 #attr_encoding = attrs.get("charset")
155 url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
156 if not url:
157 # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
158 # For our purposes a link is something with a URL, so
159 # ignore this.
160 continue
162 url = _rfc3986.clean_url(url, encoding)
163 if tag == "a":
164 if token.type != "startendtag":
165 # hmm, this'd break if end tag is missing
166 text = p.get_compressed_text(("endtag", tag))
167 # but this doesn't work for eg.
168 # <a href="blah"><b>Andy</b></a>
169 #text = p.get_compressed_text()
171 yield Link(base_url, url, text, tag, token.attrs)
172 except sgmllib.SGMLParseError, exc:
173 raise ParseError(exc)
175 class FormsFactory:
177 """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
179 After calling .forms(), the .global_form attribute is a form object
180 containing all controls not a descendant of any FORM element.
182 For constructor argument docs, see ClientForm.ParseResponse
183 argument docs.
187 def __init__(self,
188 select_default=False,
189 form_parser_class=None,
190 request_class=None,
191 backwards_compat=False,
193 import ClientForm
194 self.select_default = select_default
195 if form_parser_class is None:
196 form_parser_class = ClientForm.FormParser
197 self.form_parser_class = form_parser_class
198 if request_class is None:
199 request_class = _request.Request
200 self.request_class = request_class
201 self.backwards_compat = backwards_compat
202 self._response = None
203 self.encoding = None
204 self.global_form = None
206 def set_response(self, response, encoding):
207 self._response = response
208 self.encoding = encoding
209 self.global_form = None
211 def forms(self):
212 import ClientForm
213 encoding = self.encoding
214 try:
215 forms = ClientForm.ParseResponseEx(
216 self._response,
217 select_default=self.select_default,
218 form_parser_class=self.form_parser_class,
219 request_class=self.request_class,
220 encoding=encoding,
221 _urljoin=_rfc3986.urljoin,
222 _urlparse=_rfc3986.urlsplit,
223 _urlunparse=_rfc3986.urlunsplit,
225 except ClientForm.ParseError, exc:
226 raise ParseError(exc)
227 self.global_form = forms[0]
228 return forms[1:]
230 class TitleFactory:
231 def __init__(self):
232 self._response = self._encoding = None
234 def set_response(self, response, encoding):
235 self._response = response
236 self._encoding = encoding
238 def title(self):
239 import _pullparser
240 p = _pullparser.TolerantPullParser(
241 self._response, encoding=self._encoding)
242 try:
243 try:
244 p.get_tag("title")
245 except _pullparser.NoMoreTokensError:
246 return None
247 else:
248 return p.get_text()
249 except sgmllib.SGMLParseError, exc:
250 raise ParseError(exc)
try:
    _unichr = unichr
except NameError:
    # no unichr builtin on this interpreter (Python 3); chr does the same job
    _unichr = chr


def unescape(data, entities, encoding):
    """Replace entity and character references in data.

    data: text to process; None and text without "&" are returned unchanged
    entities: mapping from entity name to code point
    encoding: encoding used for the replacement characters, or None to
     insert them as unicode

    References that are unknown, malformed, or not representable in
    encoding are left in the text as they are.
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            return unescape_charref(ent[2:-1], encoding)

        codepoint = entities.get(ent[1:-1])
        if codepoint is None:
            # unknown entity name: keep the reference verbatim
            return ent
        repl = _unichr(codepoint)
        if encoding is None:
            # same convention as unescape_charref: no encoding, no encode
            return repl
        try:
            return repl.encode(encoding)
        except UnicodeError:
            return ent

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)

def unescape_charref(data, encoding):
    """Return the character for a numeric character reference.

    data: the reference without its "&#" prefix and ";" suffix, i.e. decimal
     digits, or "x"/"X" followed by hexadecimal digits
    encoding: encoding for the result, or None to return unicode

    If data is not a valid number, is out of range, or the character cannot
    be encoded, the original reference text "&#<data>;" is returned.
    """
    name, base = data, 10
    if name[:1] in ("x", "X"):
        name, base = name[1:], 16
    try:
        uc = _unichr(int(name, base))
    except (ValueError, OverflowError):
        # not a number at all (e.g. "&#abc;") or not a valid code point
        return "&#%s;" % data
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
# Build the BeautifulSoup-backed form parser classes from the bundled
# BeautifulSoup classes.
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup,
    _beautifulsoup.ICantBelieveItsBeautifulSoup,
    )

# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# The replacement pattern accepts an optional leading "x" followed by
# hexadecimal digits.
import sgmllib
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that resolves references with unescape().

    Character and entity references are converted using this module's
    unescape() and the encoding given to the constructor.
    """

    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [
        (re.compile('(<[^<>]*)/>'),
         lambda match: match.group(1) + ' />'),
        (re.compile('<!\s+([^<>]*)>'),
         lambda match: '<!' + match.group(1) + '>'),
        ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        # must be set before the base class gets hold of the text
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        self.handle_data(
            unescape("&#%s;"%ref, self._entitydefs, self._encoding))

    def handle_entityref(self, ref):
        self.handle_data(
            unescape("&%s;"%ref, self._entitydefs, self._encoding))

    def unescape_attrs(self, attrs):
        """Return attrs as a list of (name, value) pairs with values unescaped."""
        return [(key, unescape(val, self._entitydefs, self._encoding))
                for key, val in attrs]
class RobustLinksFactory:
    """Extracts links from a parsed BeautifulSoup document.

    Constructor arguments:

    link_parser_class: stored on the instance; defaults to MechanizeBs
    link_class: class used to build each link; called as
     link_class(base_url, url, text, tag, attrs).  Defaults to Link.
    urltags: mapping from tag name to the attribute holding that tag's URL
    """

    compress_re = re.compile(r"\s+")

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        """Set the parsed document, its base URL and its encoding."""
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Return an iterator that provides links of the document."""
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # tags of interest, computed once rather than for every child node
        wanted_tags = self.urltags.keys() + ["base"]
        for ch in bs.recursiveChildGenerator():
            if not (isinstance(ch, _beautifulsoup.Tag) and
                    ch.name in wanted_tags):
                continue
            link = ch
            attrs = bs.unescape_attrs(link.attrs)
            attrs_dict = dict(attrs)
            if link.name == "base":
                # a BASE element changes the base URL of later links
                base_href = attrs_dict.get("href")
                if base_href is not None:
                    base_url = base_href
                continue
            url_attr = self.urltags[link.name]
            url = attrs_dict.get(url_attr)
            if not url:
                # no URL, so not a link for our purposes
                continue
            url = _rfc3986.clean_url(url, encoding)
            text = link.firstText(lambda t: True)
            if text is _beautifulsoup.Null:
                # follow _pullparser's weird behaviour rigidly
                if link.name == "a":
                    text = ""
                else:
                    text = None
            else:
                text = self.compress_re.sub(" ", text.strip())
            # honour the link_class passed to the constructor
            yield self.link_class(base_url, url, text, link.name, attrs)
class RobustFormsFactory(FormsFactory):
    """FormsFactory whose default form parser is RobustFormParser.

    Accepts the same constructor arguments as FormsFactory.
    """

    def __init__(self, *args, **kwds):
        # Normalise positional/keyword arguments into one dict, then apply
        # the default on that dict itself so it actually reaches
        # FormsFactory.__init__ (setting it as an attribute on the Args
        # object would not be visible in its .dictionary).
        init_kwds = dict(form_parser_args(*args, **kwds).dictionary)
        if init_kwds["form_parser_class"] is None:
            init_kwds["form_parser_class"] = RobustFormParser
        FormsFactory.__init__(self, **init_kwds)

    def set_response(self, response, encoding):
        """Set the response to parse and its encoding."""
        # delegate so the previous response's global_form is cleared too
        FormsFactory.set_response(self, response, encoding)
class RobustTitleFactory:
    """Extracts the document title from a parsed BeautifulSoup document."""

    def __init__(self):
        self._bs = None
        self._encoding = None

    def set_soup(self, soup, encoding):
        """Set the parsed document and its encoding."""
        self._bs = soup
        self._encoding = encoding

    def title(self):
        """Return the first text of the first TITLE element, or None if there is no TITLE."""
        import _beautifulsoup
        title_tag = self._bs.first("title")
        if title_tag == _beautifulsoup.Null:
            return None
        return title_tag.firstText(lambda t: True)
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    # Attributes computed on first access by __getattr__ and discarded again
    # by set_response().
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        forms_factory: object providing .forms() and a .request_class
         attribute
        links_factory: object providing .links()
        title_factory: object providing .title()
        encoding_finder: object whose .encoding(response) returns the
         character encoding of the response.  The default one falls back to
         DEFAULT_ENCODING (currently latin-1) when the response headers do
         not name a charset.
        response_type_finder: object whose .is_html(response, encoding) says
         whether the response is an HTML document

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        """
        self._response = response
        self._forms_genf = None
        self._links_genf = None
        self._get_title = None
        # drop everything cached for the previous response
        for lazy_name in self.LAZY_ATTRS:
            try:
                delattr(self, lazy_name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        # Only called when normal lookup fails, i.e. for a lazy attribute
        # that has not been computed yet (or for a genuinely missing name).
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        if name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        if name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        if name == "global_form":
            # forms() stores .global_form as a side-effect
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                all_forms = self._forms_factory.forms()
                self._forms_genf = CachingGeneratorFunction(all_forms)
            except:  # XXXX define exception!
                # reset all cached state before letting the error through
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                all_links = self._links_factory.links()
                self._links_genf = CachingGeneratorFunction(all_links)
            except:  # XXXX define exception!
                # reset all cached state before letting the error through
                self.set_response(self._response)
                raise
        return self._links_genf()
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        type_finder = ResponseTypeFinder(
            allow_xhtml=i_want_broken_xhtml_support)
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=type_finder,
            )

    def set_response(self, response):
        """Set response and hand a separate copy of it to each sub-factory."""
        Factory.set_response(self, response)
        if response is None:
            return
        forms_response = copy.copy(response)
        # first access computes the encoding; later reads reuse the value
        encoding = self.encoding
        self._forms_factory.set_response(forms_response, encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), encoding)
        self._title_factory.set_response(copy.copy(response), encoding)
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        """
        i_want_broken_xhtml_support: passed to ResponseTypeFinder as
         allow_xhtml
        soup_class: class used to parse the response data; called as
         soup_class(encoding, data).  Defaults to MechanizeBs.
        """
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        """Set response, parse it once, and share the soup with the sub-factories.

        The forms factory still parses for itself and so receives a copy of
        the response; the links and title factories receive the soup.
        """
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)