mechanize/_html.py

   1 """HTML handling.
   2
   3 Copyright 2003-2006 John J. Lee <jjl@pobox.com>
   4
   5 This code is free software; you can redistribute it and/or modify it under
   6 the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
   7 included with the distribution).
   8
   9 """
  10
  11 import re, copy, htmlentitydefs
  12 import sgmllib, HTMLParser, ClientForm
  13
  14 import _request
  15 from _headersutil import split_header_words, is_html as _is_html
  16 import _rfc3986
  17
# Fallback character encoding.  Factory's default EncodingFinder is built
# with this value and returns it when no Content-Type header of the response
# carries a charset parameter.
DEFAULT_ENCODING = "latin-1"
  19
  20
  21 # the base classe is purely for backwards compatibility
  22 class ParseError(ClientForm.ParseError): pass
  23
  24
  25 class CachingGeneratorFunction(object):
  26     """Caching wrapper around a no-arguments iterable."""
  27
  28     def __init__(self, iterable):
  29         self._cache = []
  30         # wrap iterable to make it non-restartable (otherwise, repeated
  31         # __call__ would give incorrect results)
  32         self._iterator = iter(iterable)
  33
  34     def __call__(self):
  35         cache = self._cache
  36         for item in cache:
  37             yield item
  38         for item in self._iterator:
  39             cache.append(item)
  40             yield item
  41
  42
  43 class EncodingFinder:
  44     def __init__(self, default_encoding):
  45         self._default_encoding = default_encoding
  46     def encoding(self, response):
  47         # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
  48         # headers may be in the response.  HTTP-EQUIV headers come last,
  49         # so try in order from first to last.
  50         for ct in response.info().getheaders("content-type"):
  51             for k, v in split_header_words([ct])[0]:
  52                 if k == "charset":
  53                     return v
  54         return self._default_encoding
  55
  56 class ResponseTypeFinder:
  57     def __init__(self, allow_xhtml):
  58         self._allow_xhtml = allow_xhtml
  59     def is_html(self, response, encoding):
  60         ct_hdrs = response.info().getheaders("content-type")
  61         url = response.geturl()
  62         # XXX encoding
  63         return _is_html(ct_hdrs, url, self._allow_xhtml)
  64
  65
  66 # idea for this argument-processing trick is from Peter Otten
  67 class Args:
  68     def __init__(self, args_map):
  69         self.dictionary = dict(args_map)
  70     def __getattr__(self, key):
  71         try:
  72             return self.dictionary[key]
  73         except KeyError:
  74             return getattr(self.__class__, key)
  75
  76 def form_parser_args(
  77     select_default=False,
  78     form_parser_class=None,
  79     request_class=None,
  80     backwards_compat=False,
  81     ):
  82     return Args(locals())
  83
  84
  85 class Link:
  86     def __init__(self, base_url, url, text, tag, attrs):
  87         assert None not in [url, tag, attrs]
  88         self.base_url = base_url
  89         self.absolute_url = _rfc3986.urljoin(base_url, url)
  90         self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
  91     def __cmp__(self, other):
  92         try:
  93             for name in "url", "text", "tag", "attrs":
  94                 if getattr(self, name) != getattr(other, name):
  95                     return -1
  96         except AttributeError:
  97             return -1
  98         return 0
  99     def __repr__(self):
 100         return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
 101             self.base_url, self.url, self.text, self.tag, self.attrs)
 102
 103
 104 class LinksFactory:
 105
 106     def __init__(self,
 107                  link_parser_class=None,
 108                  link_class=Link,
 109                  urltags=None,
 110                  ):
 111         import _pullparser
 112         if link_parser_class is None:
 113             link_parser_class = _pullparser.TolerantPullParser
 114         self.link_parser_class = link_parser_class
 115         self.link_class = link_class
 116         if urltags is None:
 117             urltags = {
 118                 "a": "href",
 119                 "area": "href",
 120                 "frame": "src",
 121                 "iframe": "src",
 122                 }
 123         self.urltags = urltags
 124         self._response = None
 125         self._encoding = None
 126
 127     def set_response(self, response, base_url, encoding):
 128         self._response = response
 129         self._encoding = encoding
 130         self._base_url = base_url
 131
 132     def links(self):
 133         """Return an iterator that provides links of the document."""
 134         response = self._response
 135         encoding = self._encoding
 136         base_url = self._base_url
 137         p = self.link_parser_class(response, encoding=encoding)
 138
 139         try:
 140             for token in p.tags(*(self.urltags.keys()+["base"])):
 141                 if token.type == "endtag":
 142                     continue
 143                 if token.data == "base":
 144                     base_href = dict(token.attrs).get("href")
 145                     if base_href is not None:
 146                         base_url = base_href
 147                     continue
 148                 attrs = dict(token.attrs)
 149                 tag = token.data
 150                 name = attrs.get("name")
 151                 text = None
 152                 # XXX use attr_encoding for ref'd doc if that doc does not
 153                 #  provide one by other means
 154                 #attr_encoding = attrs.get("charset")
 155                 url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
 156                 if not url:
 157                     # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
 158                     # For our purposes a link is something with a URL, so
 159                     # ignore this.
 160                     continue
 161
 162                 url = _rfc3986.clean_url(url, encoding)
 163                 if tag == "a":
 164                     if token.type != "startendtag":
 165                         # hmm, this'd break if end tag is missing
 166                         text = p.get_compressed_text(("endtag", tag))
 167                     # but this doesn't work for eg.
 168                     # <a href="blah"><b>Andy</b></a>
 169                     #text = p.get_compressed_text()
 170
 171                 yield Link(base_url, url, text, tag, token.attrs)
 172         except sgmllib.SGMLParseError, exc:
 173             raise ParseError(exc)
 174
 175 class FormsFactory:
 176
 177     """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
 178
 179     After calling .forms(), the .global_form attribute is a form object
 180     containing all controls not a descendant of any FORM element.
 181
 182     For constructor argument docs, see ClientForm.ParseResponse
 183     argument docs.
 184
 185     """
 186
 187     def __init__(self,
 188                  select_default=False,
 189                  form_parser_class=None,
 190                  request_class=None,
 191                  backwards_compat=False,
 192                  ):
 193         import ClientForm
 194         self.select_default = select_default
 195         if form_parser_class is None:
 196             form_parser_class = ClientForm.FormParser
 197         self.form_parser_class = form_parser_class
 198         if request_class is None:
 199             request_class = _request.Request
 200         self.request_class = request_class
 201         self.backwards_compat = backwards_compat
 202         self._response = None
 203         self.encoding = None
 204         self.global_form = None
 205
 206     def set_response(self, response, encoding):
 207         self._response = response
 208         self.encoding = encoding
 209         self.global_form = None
 210
 211     def forms(self):
 212         import ClientForm
 213         encoding = self.encoding
 214         try:
 215             forms = ClientForm.ParseResponseEx(
 216                 self._response,
 217                 select_default=self.select_default,
 218                 form_parser_class=self.form_parser_class,
 219                 request_class=self.request_class,
 220                 encoding=encoding,
 221                 _urljoin=_rfc3986.urljoin,
 222                 _urlparse=_rfc3986.urlsplit,
 223                 _urlunparse=_rfc3986.urlunsplit,
 224                 )
 225         except ClientForm.ParseError, exc:
 226             raise ParseError(exc)
 227         self.global_form = forms[0]
 228         return forms[1:]
 229
 230 class TitleFactory:
 231     def __init__(self):
 232         self._response = self._encoding = None
 233
 234     def set_response(self, response, encoding):
 235         self._response = response
 236         self._encoding = encoding
 237
 238     def title(self):
 239         import _pullparser
 240         p = _pullparser.TolerantPullParser(
 241             self._response, encoding=self._encoding)
 242         try:
 243             try:
 244                 p.get_tag("title")
 245             except _pullparser.NoMoreTokensError:
 246                 return None
 247             else:
 248                 return p.get_text()
 249         except sgmllib.SGMLParseError, exc:
 250             raise ParseError(exc)
 251
 252
 253 def unescape(data, entities, encoding):
 254     if data is None or "&" not in data:
 255         return data
 256
 257     def replace_entities(match):
 258         ent = match.group()
 259         if ent[1] == "#":
 260             return unescape_charref(ent[2:-1], encoding)
 261
 262         repl = entities.get(ent[1:-1])
 263         if repl is not None:
 264             repl = unichr(repl)
 265             if type(repl) != type(""):
 266                 try:
 267                     repl = repl.encode(encoding)
 268                 except UnicodeError:
 269                     repl = ent
 270         else:
 271             repl = ent
 272         return repl
 273
 274     return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
 275
 276 def unescape_charref(data, encoding):
 277     name, base = data, 10
 278     if name.startswith("x"):
 279         name, base= name[1:], 16
 280     uc = unichr(int(name, base))
 281     if encoding is None:
 282         return uc
 283     else:
 284         try:
 285             repl = uc.encode(encoding)
 286         except UnicodeError:
 287             repl = "&#%s;" % data
 288         return repl
 289
 290
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
# Form parser classes created by ClientForm from the two bundled
# BeautifulSoup classes.  RobustFormParser is the parser RobustFormsFactory
# falls back to when it is not given a form_parser_class.
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# NOTE: this rebinds an attribute of the sgmllib module itself, so it takes
# effect for every user of sgmllib in the process, as soon as this module is
# imported.  The pattern accepts an optional "x" prefix followed by hex
# digits, and needs one trailing non-hex-digit character to match.
import sgmllib
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
 300
 301 class MechanizeBs(_beautifulsoup.BeautifulSoup):
 302     _entitydefs = htmlentitydefs.name2codepoint
 303     # don't want the magic Microsoft-char workaround
 304     PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
 305                        lambda(x):x.group(1) + ' />'),
 306                       (re.compile('<!\s+([^<>]*)>'),
 307                        lambda(x):'<!' + x.group(1) + '>')
 308                       ]
 309
 310     def __init__(self, encoding, text=None, avoidParserProblems=True,
 311                  initialTextIsEverything=True):
 312         self._encoding = encoding
 313         _beautifulsoup.BeautifulSoup.__init__(
 314             self, text, avoidParserProblems, initialTextIsEverything)
 315
 316     def handle_charref(self, ref):
 317         t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
 318         self.handle_data(t)
 319     def handle_entityref(self, ref):
 320         t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
 321         self.handle_data(t)
 322     def unescape_attrs(self, attrs):
 323         escaped_attrs = []
 324         for key, val in attrs:
 325             val = unescape(val, self._entitydefs, self._encoding)
 326             escaped_attrs.append((key, val))
 327         return escaped_attrs
 328
 329 class RobustLinksFactory:
 330
 331     compress_re = re.compile(r"\s+")
 332
 333     def __init__(self,
 334                  link_parser_class=None,
 335                  link_class=Link,
 336                  urltags=None,
 337                  ):
 338         import _beautifulsoup
 339         if link_parser_class is None:
 340             link_parser_class = MechanizeBs
 341         self.link_parser_class = link_parser_class
 342         self.link_class = link_class
 343         if urltags is None:
 344             urltags = {
 345                 "a": "href",
 346                 "area": "href",
 347                 "frame": "src",
 348                 "iframe": "src",
 349                 }
 350         self.urltags = urltags
 351         self._bs = None
 352         self._encoding = None
 353         self._base_url = None
 354
 355     def set_soup(self, soup, base_url, encoding):
 356         self._bs = soup
 357         self._base_url = base_url
 358         self._encoding = encoding
 359
 360     def links(self):
 361         import _beautifulsoup
 362         bs = self._bs
 363         base_url = self._base_url
 364         encoding = self._encoding
 365         gen = bs.recursiveChildGenerator()
 366         for ch in bs.recursiveChildGenerator():
 367             if (isinstance(ch, _beautifulsoup.Tag) and
 368                 ch.name in self.urltags.keys()+["base"]):
 369                 link = ch
 370                 attrs = bs.unescape_attrs(link.attrs)
 371                 attrs_dict = dict(attrs)
 372                 if link.name == "base":
 373                     base_href = attrs_dict.get("href")
 374                     if base_href is not None:
 375                         base_url = base_href
 376                     continue
 377                 url_attr = self.urltags[link.name]
 378                 url = attrs_dict.get(url_attr)
 379                 if not url:
 380                     continue
 381                 url = _rfc3986.clean_url(url, encoding)
 382                 text = link.firstText(lambda t: True)
 383                 if text is _beautifulsoup.Null:
 384                     # follow _pullparser's weird behaviour rigidly
 385                     if link.name == "a":
 386                         text = ""
 387                     else:
 388                         text = None
 389                 else:
 390                     text = self.compress_re.sub(" ", text.strip())
 391                 yield Link(base_url, url, text, link.name, attrs)
 392
 393
 394 class RobustFormsFactory(FormsFactory):
 395     def __init__(self, *args, **kwds):
 396         import ClientForm
 397         args = form_parser_args(*args, **kwds)
 398         if args.form_parser_class is None:
 399             args.form_parser_class = RobustFormParser
 400         FormsFactory.__init__(self, **args.dictionary)
 401
 402     def set_response(self, response, encoding):
 403         self._response = response
 404         self.encoding = encoding
 405
 406
 407 class RobustTitleFactory:
 408     def __init__(self):
 409         self._bs = self._encoding = None
 410
 411     def set_soup(self, soup, encoding):
 412         self._bs = soup
 413         self._encoding = encoding
 414
 415     def title(self):
 416         import _beautifulsoup
 417         title = self._bs.first("title")
 418         if title == _beautifulsoup.Null:
 419             return None
 420         else:
 421             return title.firstText(lambda t: True)
 422
 423
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    These attributes are computed on first access (see __getattr__) and then
    kept on the instance until the next call to set_response().

    """

    # names of the attributes that __getattr__ computes on demand and that
    # set_response() discards
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    # Note: the default encoding_finder and response_type_finder objects are
    # created once, when this class is defined, and are shared by every
    # instance that does not pass its own.
    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        forms_factory: object whose .forms() method is used by .forms(), and
         whose .global_form attribute (if it has one) supplies .global_form
        links_factory: object whose .links() method is used by .links()
        title_factory: object whose .title() method supplies .title
        encoding_finder: object whose .encoding(response) method supplies
         .encoding.  The default falls back to DEFAULT_ENCODING (currently
         latin-1; this may change in future) if the encoding cannot be
         determined from the response.  You should turn on HTTP-EQUIV
         handling if you want the best chance of getting this right without
         resorting to that fallback.
        response_type_finder: object whose .is_html(response, encoding)
         method supplies .is_html

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        # start out with no response and nothing cached
        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        Everything cached for the previous response is discarded.

        """
        self._response = response
        # cached CachingGeneratorFunction objects for .forms() and .links()
        self._forms_genf = self._links_genf = None
        self._get_title = None
        # Remove any lazily computed values from the instance, so that the
        # next access goes through __getattr__ again.
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                # not computed yet for the previous response
                pass

    def __getattr__(self, name):
        """Compute, store and return one of the LAZY_ATTRS.

        This is only called when normal attribute lookup fails, i.e. when
        the value has not yet been stored on the instance since the last
        set_response().  Any other name is looked up on the class, which
        raises AttributeError if that fails too.

        """
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            # the finders are handed a copy of the response
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            # .forms() stores .global_form on the instance as a side-effect
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                # start again from a clean state for the same response, then
                # let the caller see the exception
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                # start again from a clean state for the same response, then
                # let the caller see the exception
                self.set_response(self._response)
                raise
        return self._links_genf()
 555
 556 class DefaultFactory(Factory):
 557     """Based on sgmllib."""
 558     def __init__(self, i_want_broken_xhtml_support=False):
 559         Factory.__init__(
 560             self,
 561             forms_factory=FormsFactory(),
 562             links_factory=LinksFactory(),
 563             title_factory=TitleFactory(),
 564             response_type_finder=ResponseTypeFinder(
 565                 allow_xhtml=i_want_broken_xhtml_support),
 566             )
 567
 568     def set_response(self, response):
 569         Factory.set_response(self, response)
 570         if response is not None:
 571             self._forms_factory.set_response(
 572                 copy.copy(response), self.encoding)
 573             self._links_factory.set_response(
 574                 copy.copy(response), response.geturl(), self.encoding)
 575             self._title_factory.set_response(
 576                 copy.copy(response), self.encoding)
 577
 578 class RobustFactory(Factory):
 579     """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
 580     DefaultFactory.
 581
 582     """
 583     def __init__(self, i_want_broken_xhtml_support=False,
 584                  soup_class=None):
 585         Factory.__init__(
 586             self,
 587             forms_factory=RobustFormsFactory(),
 588             links_factory=RobustLinksFactory(),
 589             title_factory=RobustTitleFactory(),
 590             response_type_finder=ResponseTypeFinder(
 591                 allow_xhtml=i_want_broken_xhtml_support),
 592             )
 593         if soup_class is None:
 594             soup_class = MechanizeBs
 595         self._soup_class = soup_class
 596
 597     def set_response(self, response):
 598         import _beautifulsoup
 599         Factory.set_response(self, response)
 600         if response is not None:
 601             data = response.read()
 602             soup = self._soup_class(self.encoding, data)
 603             self._forms_factory.set_response(
 604                 copy.copy(response), self.encoding)
 605             self._links_factory.set_soup(
 606                 soup, response.geturl(), self.encoding)
 607             self._title_factory.set_soup(soup, self.encoding)