3 Copyright 2003-2006 John J. Lee <jjl@pobox.com>
5 This code is free software; you can redistribute it and/or modify it under
6 the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
7 included with the distribution).
11 import re
, copy
, htmlentitydefs
12 import sgmllib
, HTMLParser
, ClientForm
15 from _headersutil
import split_header_words
, is_html
as _is_html
# Character encoding handed to the default EncodingFinder; used when a
# response's encoding cannot be determined from its headers.
DEFAULT_ENCODING = "latin-1"
# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError):
    """Parse failure raised by this module.

    It derives from ClientForm.ParseError only so that existing code which
    catches that exception keeps working.
    """
    pass
25 class CachingGeneratorFunction(object):
26 """Caching wrapper around a no-arguments iterable."""
28 def __init__(self
, iterable
):
30 # wrap iterable to make it non-restartable (otherwise, repeated
31 # __call__ would give incorrect results)
32 self
._iterator
= iter(iterable
)
38 for item
in self
._iterator
:
def __init__(self, default_encoding):
    """Remember the encoding to fall back on.

    default_encoding: value returned by encoding() when it reaches its
    final return statement without having found anything better.
    """
    self._default_encoding = default_encoding
46 def encoding(self
, response
):
47 # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
48 # headers may be in the response. HTTP-EQUIV headers come last,
49 # so try in order from first to last.
50 for ct
in response
.info().getheaders("content-type"):
51 for k
, v
in split_header_words([ct
])[0]:
54 return self
._default
_encoding
class ResponseTypeFinder:
    """Decides whether a response contains an HTML document."""

    def __init__(self, allow_xhtml):
        # Forwarded unchanged to _is_html() on every is_html() call.
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        """Return whatever _is_html() reports for response.

        response: must provide .info().getheaders(name) and .geturl().
        encoding: accepted as part of the interface; not used here.
        """
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        return _is_html(ct_hdrs, url, self._allow_xhtml)
66 # idea for this argument-processing trick is from Peter Otten
def __init__(self, args_map):
    """Store a private copy of args_map as self.dictionary.

    args_map: anything dict() accepts (a mapping, or an iterable of
    key/value pairs).
    """
    # dict() builds a new dictionary, so later changes to the caller's
    # mapping are not reflected here.
    self.dictionary = dict(args_map)
70 def __getattr__(self
, key
):
72 return self
.dictionary
[key
]
74 return getattr(self
.__class
__, key
)
78 form_parser_class
=None,
80 backwards_compat
=False,
def __init__(self, base_url, url, text, tag, attrs):
    """Record one link.

    base_url: URL that url is joined against to form absolute_url.
    url: the link target as given; must not be None.
    text: stored as-is (not covered by the None check below).
    tag: stored as-is; must not be None.
    attrs: stored as-is; must not be None.
    """
    assert None not in [url, tag, attrs]
    self.base_url = base_url
    # Keep both the joined form and the URL exactly as it was given.
    self.absolute_url = _rfc3986.urljoin(base_url, url)
    self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
91 def __cmp__(self
, other
):
93 for name
in "url", "text", "tag", "attrs":
94 if getattr(self
, name
) != getattr(other
, name
):
96 except AttributeError:
100 return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
101 self
.base_url
, self
.url
, self
.text
, self
.tag
, self
.attrs
)
107 link_parser_class
=None,
112 if link_parser_class
is None:
113 link_parser_class
= _pullparser
.TolerantPullParser
114 self
.link_parser_class
= link_parser_class
115 self
.link_class
= link_class
123 self
.urltags
= urltags
124 self
._response
= None
125 self
._encoding
= None
def set_response(self, response, base_url, encoding):
    """Store the response, its base URL and its encoding on this object.

    Nothing is parsed here; the values are only saved as _response,
    _base_url and _encoding.
    """
    self._response = response
    self._encoding = encoding
    self._base_url = base_url
133 """Return an iterator that provides links of the document."""
134 response
= self
._response
135 encoding
= self
._encoding
136 base_url
= self
._base
_url
137 p
= self
.link_parser_class(response
, encoding
=encoding
)
140 for token
in p
.tags(*(self
.urltags
.keys()+["base"])):
141 if token
.type == "endtag":
143 if token
.data
== "base":
144 base_href
= dict(token
.attrs
).get("href")
145 if base_href
is not None:
148 attrs
= dict(token
.attrs
)
150 name
= attrs
.get("name")
152 # XXX use attr_encoding for ref'd doc if that doc does not
153 # provide one by other means
154 #attr_encoding = attrs.get("charset")
155 url
= attrs
.get(self
.urltags
[tag
]) # XXX is "" a valid URL?
157 # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
158 # For our purposes a link is something with a URL, so
162 url
= _rfc3986
.clean_url(url
, encoding
)
164 if token
.type != "startendtag":
165 # hmm, this'd break if end tag is missing
166 text
= p
.get_compressed_text(("endtag", tag
))
167 # but this doesn't work for eg.
168 # <a href="blah"><b>Andy</b></a>
169 #text = p.get_compressed_text()
171 yield Link(base_url
, url
, text
, tag
, token
.attrs
)
172 except sgmllib
.SGMLParseError
, exc
:
173 raise ParseError(exc
)
177 """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
179 After calling .forms(), the .global_form attribute is a form object
180 containing all controls not a descendant of any FORM element.
182 For constructor argument docs, see ClientForm.ParseResponse
188 select_default
=False,
189 form_parser_class
=None,
191 backwards_compat
=False,
194 self
.select_default
= select_default
195 if form_parser_class
is None:
196 form_parser_class
= ClientForm
.FormParser
197 self
.form_parser_class
= form_parser_class
198 if request_class
is None:
199 request_class
= _request
.Request
200 self
.request_class
= request_class
201 self
.backwards_compat
= backwards_compat
202 self
._response
= None
204 self
.global_form
= None
def set_response(self, response, encoding):
    """Store a new response and encoding, and forget any global form.

    global_form is reset to None so that a value belonging to a previous
    response is not left behind.
    """
    self._response = response
    self.encoding = encoding
    self.global_form = None
213 encoding
= self
.encoding
215 forms
= ClientForm
.ParseResponseEx(
217 select_default
=self
.select_default
,
218 form_parser_class
=self
.form_parser_class
,
219 request_class
=self
.request_class
,
221 _urljoin
=_rfc3986
.urljoin
,
222 _urlparse
=_rfc3986
.urlsplit
,
223 _urlunparse
=_rfc3986
.urlunsplit
,
225 except ClientForm
.ParseError
, exc
:
226 raise ParseError(exc
)
227 self
.global_form
= forms
[0]
232 self
._response
= self
._encoding
= None
def set_response(self, response, encoding):
    """Store the response and its encoding as _response and _encoding."""
    self._response = response
    self._encoding = encoding
240 p
= _pullparser
.TolerantPullParser(
241 self
._response
, encoding
=self
._encoding
)
245 except _pullparser
.NoMoreTokensError
:
249 except sgmllib
.SGMLParseError
, exc
:
250 raise ParseError(exc
)
253 def unescape(data
, entities
, encoding
):
254 if data
is None or "&" not in data
:
257 def replace_entities(match
):
260 return unescape_charref(ent
[2:-1], encoding
)
262 repl
= entities
.get(ent
[1:-1])
265 if type(repl
) != type(""):
267 repl
= repl
.encode(encoding
)
274 return re
.sub(r
"&#?[A-Za-z0-9]+?;", replace_entities
, data
)
276 def unescape_charref(data
, encoding
):
277 name
, base
= data
, 10
278 if name
.startswith("x"):
279 name
, base
= name
[1:], 16
280 uc
= unichr(int(name
, base
))
285 repl
= uc
.encode(encoding
)
287 repl
= "&#%s;" % data
291 # bizarre import gymnastics for bundled BeautifulSoup
292 import _beautifulsoup
294 RobustFormParser
, NestingRobustFormParser
= ClientForm
._create
_bs
_classes
(
295 _beautifulsoup
.BeautifulSoup
, _beautifulsoup
.ICantBelieveItsBeautifulSoup
297 # monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
299 sgmllib
.charref
= re
.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
301 class MechanizeBs(_beautifulsoup
.BeautifulSoup
):
302 _entitydefs
= htmlentitydefs
.name2codepoint
303 # don't want the magic Microsoft-char workaround
304 PARSER_MASSAGE
= [(re
.compile('(<[^<>]*)/>'),
305 lambda(x
):x
.group(1) + ' />'),
306 (re
.compile('<!\s+([^<>]*)>'),
307 lambda(x
):'<!' + x
.group(1) + '>')
def __init__(self, encoding, text=None, avoidParserProblems=True,
             initialTextIsEverything=True):
    """Remember encoding, then run the BeautifulSoup constructor.

    encoding: saved as self._encoding.
    text, avoidParserProblems, initialTextIsEverything: passed positionally
    and unchanged to _beautifulsoup.BeautifulSoup.__init__.
    """
    # Set before the base constructor runs.
    # NOTE(review): presumably the base constructor may already parse text
    # and need _encoding -- confirm before reordering.
    self._encoding = encoding
    _beautifulsoup.BeautifulSoup.__init__(
        self, text, avoidParserProblems, initialTextIsEverything)
316 def handle_charref(self
, ref
):
317 t
= unescape("&#%s;"%ref, self
._entitydefs
, self
._encoding
)
319 def handle_entityref(self
, ref
):
320 t
= unescape("&%s;"%ref, self
._entitydefs
, self
._encoding
)
322 def unescape_attrs(self
, attrs
):
324 for key
, val
in attrs
:
325 val
= unescape(val
, self
._entitydefs
, self
._encoding
)
326 escaped_attrs
.append((key
, val
))
329 class RobustLinksFactory
:
331 compress_re
= re
.compile(r
"\s+")
334 link_parser_class
=None,
338 import _beautifulsoup
339 if link_parser_class
is None:
340 link_parser_class
= MechanizeBs
341 self
.link_parser_class
= link_parser_class
342 self
.link_class
= link_class
350 self
.urltags
= urltags
352 self
._encoding
= None
353 self
._base
_url
= None
355 def set_soup(self
, soup
, base_url
, encoding
):
357 self
._base
_url
= base_url
358 self
._encoding
= encoding
361 import _beautifulsoup
363 base_url
= self
._base
_url
364 encoding
= self
._encoding
365 gen
= bs
.recursiveChildGenerator()
366 for ch
in bs
.recursiveChildGenerator():
367 if (isinstance(ch
, _beautifulsoup
.Tag
) and
368 ch
.name
in self
.urltags
.keys()+["base"]):
370 attrs
= bs
.unescape_attrs(link
.attrs
)
371 attrs_dict
= dict(attrs
)
372 if link
.name
== "base":
373 base_href
= attrs_dict
.get("href")
374 if base_href
is not None:
377 url_attr
= self
.urltags
[link
.name
]
378 url
= attrs_dict
.get(url_attr
)
381 url
= _rfc3986
.clean_url(url
, encoding
)
382 text
= link
.firstText(lambda t
: True)
383 if text
is _beautifulsoup
.Null
:
384 # follow _pullparser's weird behaviour rigidly
390 text
= self
.compress_re
.sub(" ", text
.strip())
391 yield Link(base_url
, url
, text
, link
.name
, attrs
)
394 class RobustFormsFactory(FormsFactory
):
395 def __init__(self
, *args
, **kwds
):
397 args
= form_parser_args(*args
, **kwds
)
398 if args
.form_parser_class
is None:
399 args
.form_parser_class
= RobustFormParser
400 FormsFactory
.__init
__(self
, **args
.dictionary
)
def set_response(self, response, encoding):
    """Store a new response and encoding, and forget any global form.

    Clearing global_form keeps this override consistent with the
    set_response of the base class it replaces, so a form object from an
    earlier response is never left behind.
    """
    self._response = response
    self.encoding = encoding
    self.global_form = None
407 class RobustTitleFactory
:
409 self
._bs
= self
._encoding
= None
411 def set_soup(self
, soup
, encoding
):
413 self
._encoding
= encoding
416 import _beautifulsoup
417 title
= self
._bs
.first("title")
418 if title
== _beautifulsoup
.Null
:
421 return title
.firstText(lambda t
: True)
425 """Factory for forms, links, etc.
427 This interface may expand in future.
431 set_request_class(request_class)
432 set_response(response)
438 Note that accessing these attributes may raise ParseError.
440 encoding: string specifying the encoding of response if it contains a text
441 document (this value is left unspecified for documents that do not have
442 an encoding, e.g. an image file)
443 is_html: true if response contains an HTML document (XHTML may be
444 regarded as HTML too)
445 title: page title, or None if no title or not HTML
446 global_form: form object containing all controls that are not descendants
447 of any FORM element, or None if the forms_factory does not support
448 supplying a global form
452 LAZY_ATTRS
= ["encoding", "is_html", "title", "global_form"]
454 def __init__(self
, forms_factory
, links_factory
, title_factory
,
455 encoding_finder
=EncodingFinder(DEFAULT_ENCODING
),
456 response_type_finder
=ResponseTypeFinder(allow_xhtml
=False),
460 Pass keyword arguments only.
462 default_encoding: character encoding to use if encoding cannot be
463 determined (or guessed) from the response. You should turn on
464 HTTP-EQUIV handling if you want the best chance of getting this right
465 without resorting to this default. The default value of this
466 parameter (currently latin-1) may change in future.
469 self
._forms
_factory
= forms_factory
470 self
._links
_factory
= links_factory
471 self
._title
_factory
= title_factory
472 self
._encoding
_finder
= encoding_finder
473 self
._response
_type
_finder
= response_type_finder
475 self
.set_response(None)
def set_request_class(self, request_class):
    """Set urllib2.Request class.

    ClientForm.HTMLForm instances returned by .forms() will return
    instances of this class when .click()ed.

    """
    # The class is stored on the forms factory rather than on this object.
    self._forms_factory.request_class = request_class
486 def set_response(self
, response
):
489 The response must either be None or implement the same interface as
490 objects returned by urllib2.urlopen().
493 self
._response
= response
494 self
._forms
_genf
= self
._links
_genf
= None
495 self
._get
_title
= None
496 for name
in self
.LAZY_ATTRS
:
499 except AttributeError:
502 def __getattr__(self
, name
):
503 if name
not in self
.LAZY_ATTRS
:
504 return getattr(self
.__class
__, name
)
506 if name
== "encoding":
507 self
.encoding
= self
._encoding
_finder
.encoding(
508 copy
.copy(self
._response
))
510 elif name
== "is_html":
511 self
.is_html
= self
._response
_type
_finder
.is_html(
512 copy
.copy(self
._response
), self
.encoding
)
514 elif name
== "title":
516 self
.title
= self
._title
_factory
.title()
520 elif name
== "global_form":
522 return self
.global_form
525 """Return iterable over ClientForm.HTMLForm-like objects.
527 Raises mechanize.ParseError on failure.
529 # this implementation sets .global_form as a side-effect, for benefit
530 # of __getattr__ impl
531 if self
._forms
_genf
is None:
533 self
._forms
_genf
= CachingGeneratorFunction(
534 self
._forms
_factory
.forms())
535 except: # XXXX define exception!
536 self
.set_response(self
._response
)
538 self
.global_form
= getattr(
539 self
._forms
_factory
, "global_form", None)
540 return self
._forms
_genf
()
543 """Return iterable over mechanize.Link-like objects.
545 Raises mechanize.ParseError on failure.
547 if self
._links
_genf
is None:
549 self
._links
_genf
= CachingGeneratorFunction(
550 self
._links
_factory
.links())
551 except: # XXXX define exception!
552 self
.set_response(self
._response
)
554 return self
._links
_genf
()
556 class DefaultFactory(Factory
):
557 """Based on sgmllib."""
558 def __init__(self
, i_want_broken_xhtml_support
=False):
561 forms_factory
=FormsFactory(),
562 links_factory
=LinksFactory(),
563 title_factory
=TitleFactory(),
564 response_type_finder
=ResponseTypeFinder(
565 allow_xhtml
=i_want_broken_xhtml_support
),
def set_response(self, response):
    """Install response here and pass it on to the three sub-factories.

    Factory.set_response always runs first.  When response is None
    nothing further is passed to the forms, links and title factories.
    """
    Factory.set_response(self, response)
    if response is not None:
        # Every sub-factory receives its own copy.copy() of the response.
        # NOTE(review): presumably so they do not share one read
        # position -- confirm.
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), self.encoding)
        self._title_factory.set_response(
            copy.copy(response), self.encoding)
578 class RobustFactory(Factory
):
579 """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
583 def __init__(self
, i_want_broken_xhtml_support
=False,
587 forms_factory
=RobustFormsFactory(),
588 links_factory
=RobustLinksFactory(),
589 title_factory
=RobustTitleFactory(),
590 response_type_finder
=ResponseTypeFinder(
591 allow_xhtml
=i_want_broken_xhtml_support
),
593 if soup_class
is None:
594 soup_class
= MechanizeBs
595 self
._soup
_class
= soup_class
def set_response(self, response):
    """Install response here and pass it on to the three sub-factories.

    Factory.set_response always runs first.  When response is not None
    its body is read once and parsed into one soup object, which the
    links and title factories both receive; the forms factory receives a
    copy of the response instead.
    """
    import _beautifulsoup
    Factory.set_response(self, response)
    if response is not None:
        data = response.read()
        soup = self._soup_class(self.encoding, data)
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_soup(
            soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)