import urllib2, urllib, re, threading, thread
from HTMLParser import HTMLParser

MAX_DOWNLOAD = 10240 * 2 # KB

class TitleParser(HTMLParser):
    # Collects the text inside the <title> tag into self.result.  The method
    # bodies were elided in this excerpt; the ones below are a minimal
    # reconstruction of that behaviour.
    result = ""
    in_title = False

    def handle_starttag(self, tag, args):
        if tag == "title":
            self.in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.result += data

class FirstONRParser(HTMLParser):
    # Grabs the href of an Oh No Robot search-result link (class="searchlink").
    result = None  # reconstructed default; the original initialisation was elided

    def handle_starttag(self, tag, args):
        args = dict(args)  # reconstructed: attrs arrive as (name, value) pairs
        if args.get("class") == "searchlink":
            self.result = args['href']

class FirstGoogleParser(HTMLParser):
    # Grabs the href of a Google result link (class="l").
    result = None  # reconstructed default; the original initialisation was elided

    def handle_starttag(self, tag, args):
        args = dict(args)  # reconstructed: attrs arrive as (name, value) pairs
        if args.get("class") == "l":
            self.result = args['href']

class GoogleCalcParser(HTMLParser):
    # Scrapes the Google calculator answer from a results page.  Most of this
    # class was elided in this excerpt; everything marked "reconstructed" is a
    # best guess at the original state handling.
    result = ""
    in_calc = False

    def handle_starttag(self, tag, args):
        args = dict(args)  # reconstructed: attrs arrive as (name, value) pairs
        if args.get("size") == "+1":
            self.in_calc = True   # reconstructed: the big font starts the answer
        elif self.in_calc and args.get("size") == "-1":
            self.in_calc = False  # reconstructed: the small font ends it

    def handle_data(self, data):
        if self.in_calc and data == "Web":
            self.in_calc = False  # reconstructed: hitting the page chrome ends collection
        elif self.in_calc:
            self.result += data   # reconstructed

    def handle_charref(self, char):
        if self.in_calc:
            self.result += unichr(int(char))  # reconstructed: decimal references only

# Maps an opening delimiter to the character(s) that should terminate a URL
# pasted right after it.
end_of = {"'": "'", '"': '"', '[': '\\]', '(': ')', '<': '>'}

def extract_url(message):
    # Pulls the first http:// URL out of a message.  Parts of this body were
    # elided; the lines marked "reconstructed" are a best guess.
    start = message.index('http://')
    end_chars = r"\s"  # reconstructed: always stop at whitespace
    if start > 0:      # reconstructed guard: only honour a delimiter that actually precedes the URL
        end_chars += end_of.get(message[start - 1], "")
    url = re.split("[%s]" % end_chars, message[start:], 1)[0]
    return url  # reconstructed
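
# Example (sketch), assuming the reconstructed return value above: the
# delimiter in front of the URL decides which characters end it.
#
#     extract_url("docs at <http://example.com/a> today")  # -> "http://example.com/a"
#     extract_url("see http://example.com/b please")       # -> "http://example.com/b"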

def _get_html(url, d):
    d["result"] = urllib2.urlopen(url).read(MAX_DOWNLOAD)
    d["event"].set()  # reconstructed: wake up the waiting caller

# This (and _get_html) is an ugly way of adding a timeout to the urllib2 call.
# Unfortunately, since urllib2 didn't think to give a normal timeout, this is
# the workaround used here.
# The wrapper's name, its wait, and its return were elided in this excerpt;
# "get_html" and the lines marked "reconstructed" below are a best guess.
def get_html(url, timeout=10):
    d = {}
    d["event"] = threading.Event()
    d["result"] = " (timed out) "
    thread.start_new_thread(_get_html, (url, d))
    d["event"].wait(timeout)  # reconstructed
    return d["result"]        # reconstructed
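
# Example (sketch), assuming the reconstructed get_html() above: a fetch that
# does not finish within the wait window comes back as the " (timed out) "
# sentinel string rather than raising.
#
#     html = get_html("http://example.com/")
#     if html == " (timed out) ":
#         pass  # give up on the slow host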

def do_parse(url, parser):
    # Most of this body was elided; the reconstruction below matches how the
    # callers use it: fetch the page and hand back whatever the parser found.
    if url == " (timed out) ":
        return url            # reconstructed
    html = get_html(url)      # reconstructed call into the timeout wrapper above
    if html == " (timed out) ":
        return html           # reconstructed
    parser.feed(html)         # reconstructed
    return parser.result      # reconstructed

whitespace = re.compile(r"\s+")

# The def line for this helper was elided; "get_title" is a reconstructed name.
def get_title(url):
    raw_title = do_parse(url, TitleParser())
    if raw_title == " (timed out) ":
        return raw_title  # reconstructed
    safe_title = whitespace.sub(" ", raw_title)
    title = safe_title.strip()
    return title  # reconstructed
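
# Example (sketch), assuming the reconstructed get_title() name above: the raw
# <title> text has its whitespace collapsed and trimmed.
#
#     get_title("http://www.python.org/")  # -> the page title, or " (timed out) "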

def first_onr(query, comic=None):
    url = 'http://ohnorobot.com/index.pl?s=' + urllib.quote_plus(query)
    if comic is not None:  # reconstructed condition; the original line was elided
        url += "&comic=%d" % comic
    return do_parse(url, FirstONRParser())

def first_google(query):
    url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
    return do_parse(url, FirstGoogleParser())

def google_calc(query):
    url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
    return do_parse(url, GoogleCalcParser())
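
# Example (sketch): the three search helpers all funnel through do_parse() and
# so share the " (timed out) " sentinel.  The comic id below is only an
# illustrative value.
#
#     first_google("html parser python")  # -> href of the first result
#     google_calc("2+2")                  # -> the calculator answer text
#     first_onr("sandwich", comic=123)    # -> first transcript match for one comic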

def get_extension(url):
    last_period = url.rfind('.')
    # Accept an extension of 1-5 characters, with the '.' not the last character.
    if len(url) - 7 < last_period < len(url) - 1:
        return url[last_period + 1:].lower()
    return None  # reconstructed: no plausible extension
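
# For example, only a short extension near the end of the URL counts, and only
# the final component is returned:
#
#     get_extension("http://x.com/movie.avi")       # -> "avi"
#     get_extension("http://x.com/archive.tar.gz")  # -> "gz"
#     get_extension("http://x.com/readme")          # -> None (the '.' is too far back)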

_known_non_webpage_extensions = {
    'mp4v': 1, 'gz': 1, 'jpeg': 1, 'jar': 1, 'mp4': 1, 'mp3': 1, 'gl': 1, 'mng': 1, 'pcx': 1, 'tz': 1,
    'm4v': 1, 'wmv': 1, 'xpm': 1, 'mpg': 1, 'dl': 1, 'mpc': 1, 'cpio': 1, 'lzh': 1, 'bat': 1, 'qt': 1,
    'cmd': 1, 'patch': 1, 'pbm': 1, 'nuv': 1, 'tex': 1, 'btm': 1, 'arj': 1, 'mpeg': 1, 'm2v': 1, 'rz': 1,
    'ra': 1, 'rm': 1, 'asf': 1, 'flc': 1, 'bz': 1, 'log': 1, 'mka': 1, 'ace': 1, 'midi': 1, 'yuv': 1,
    'tbz2': 1, 'pdf': 1, 'com': 1, 'deb': 1, 'tgz': 1, 'tiff': 1, 'pgm': 1, 'ppm': 1, 'tga': 1, 'diff': 1,
    'txt': 1, 'rpm': 1, 'ps': 1, 'vob': 1, 'zip': 1, 'gif': 1, 'mkv': 1, 'rmvb': 1, 'wav': 1, 'ogm': 1,
    'bmp': 1, 'jpg': 1, 'flac': 1, 'ogg': 1, 'Z': 1, 'png': 1, 'aac': 1, 'fli': 1, 'au': 1, 'xwd': 1,
    'z': 1, 'xcf': 1, 'tar': 1, 'taz': 1, 'rar': 1, 'avi': 1, '7z': 1, 'csh': 1, 'mid': 1, 'zoo': 1,
    'tif': 1, 'mov': 1, 'bz2': 1, 'exe': 1, 'doc': 1, 'xbm': 1, 'sh': 1,
}

# Note: non_webpage and is_webpage are NOT inverses. Unknown URL types will
# return False from both.
def non_webpage(url):
    '''Returns true if the URL's extension is a known non-webpage type.'''
    return get_extension(url) in _known_non_webpage_extensions

_known_webpage_extensions = {'htm': 1, 'html': 1, 'shtml': 1, 'asp': 1, 'pl': 1,
                             'cgi': 1, 'jsp': 1, 'php': 1}

# Note: non_webpage and is_webpage are NOT inverses. Unknown URL types will
# return False from both.
def is_webpage(url):  # reconstructed def line; it was elided in this excerpt
    '''Returns true if the URL's extension is a known webpage type.'''
    return get_extension(url) in _known_webpage_extensions
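
# A small offline self-test (sketch) illustrating the "NOT inverses" note
# above: known extensions are classified, unknown ones fail both checks.
if __name__ == "__main__":
    for u in ("http://example.com/paper.pdf",
              "http://example.com/index.html",
              "http://example.com/thing.xyz"):
        print u, get_extension(u), non_webpage(u), is_webpage(u)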