1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
12 class _HRefParser(HTMLParser
.HTMLParser
):
14 HTMLParser
.HTMLParser
.__init
__(self
)
17 def handle_starttag(self
, tag
, attrs
):
19 for name
, value
in attrs
:
21 self
.hrefs
.append(value
)
24 def _AbsoluteUrlHasSaneScheme(absolute_url
):
25 if len(absolute_url
) < 4:
27 return absolute_url
[0:4] == 'http'
def GenerateSafeUrls():
    """Prints a list of safe urls.

    Generates a safe list of urls from a seed list. Each href in the HTML
    fetched from a seed url is resolved against that seed url and, if the
    result passes _AbsoluteUrlHasSaneScheme, placed into the safe list. The
    safe list contains unsanitized urls.

    The de-duplicated, sorted list is written to stdout as a JSON array.
    Nothing is returned.
    """
    # A list of websites whose hrefs are unlikely to link to unsafe sites.
    # NOTE(review): entries are reproduced exactly as visible in the reviewed
    # source; confirm that no seed url was elided before the first one.
    seed_urls = [
        "https://www.youtube.com",
        "https://www.facebook.com",
        "https://www.twitter.com",
        "https://www.yahoo.com",
        "https://www.amazon.com",
        "https://www.wikipedia.com",
        "https://www.bing.com",
        "https://www.dailymotion.com",
        "https://www.stackoverflow.com",
        "https://www.google.com/#q=dumpling",
        "http://www.baidu.com/s?wd=rice",
        "http://www.baidu.com/s?wd=cow",
        "https://www.google.com/#q=fox",
        "http://www.yahoo.co.jp/",
        "http://www.yandex.ru/",
        "https://www.imdb.com/",
        "http://www.huffingtonpost.com/",
        "https://www.deviantart.com/",
        "http://www.wsj.com/",
    ]

    # A set, so that a url linked from several pages is only listed once.
    safe_urls = set()

    for url in seed_urls:
        try:
            # Fetch and parse the HTML.
            response = urllib2.urlopen(url)
            try:
                encoding = response.headers.getparam('charset')
                html = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()

            # The response may not declare a charset; only decode when one
            # is known, otherwise hand the raw bytes to the parser.
            if encoding:
                html = html.decode(encoding)

            parser = _HRefParser()
            parser.feed(html)
        except Exception:
            # NOTE(review): a failing seed is logged and skipped so the hrefs
            # loop below never runs with an unbound or stale parser. Confirm
            # that skipping (rather than aborting the run) is the intent.
            logging.exception("Error fetching or parsing url: %s", url)
            continue

        # Looks for all hrefs.
        for relative_url in parser.hrefs:
            # An href without a value carries no link target.
            if not relative_url:
                continue

            absolute_url = urlparse.urljoin(url, relative_url)
            if not _AbsoluteUrlHasSaneScheme(absolute_url):
                continue
            safe_urls.add(absolute_url)

    # Sort the urls, to make them easier to view in bulk.
    safe_urls_list = sorted(safe_urls)

    # Parenthesised single-argument form is valid as both the Python 2 print
    # statement and the Python 3 print function.
    print(json.dumps(safe_urls_list, indent=2, separators=(",", ":")))
95 if __name__
== "__main__":