1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
class _HRefParser(HTMLParser.HTMLParser):
  """HTML parser that collects the href target of every anchor tag.

  After feed()-ing it a document, |hrefs| holds the raw (unsanitized)
  href attribute values of all <a> tags, in document order.
  """

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    # Accumulates href values as handle_starttag() sees them; the extraction
    # had dropped this initialization, so append() would raise AttributeError.
    self.hrefs = []

  def handle_starttag(self, tag, attrs):
    """Records the href attribute of each <a> start tag encountered.

    Args:
      tag: Lower-cased tag name supplied by HTMLParser.
      attrs: List of (name, value) attribute pairs for the tag.
    """
    # Only anchor tags, and only their href attribute, are of interest;
    # without these guards every attribute of every tag would be collected.
    if tag == "a":
      for name, value in attrs:
        if name == "href":
          self.hrefs.append(value)
24 def GenerateSafeUrls():
25 """Prints a list of safe urls.
27 Generates a safe list of urls from a seed list. Each href in the HTML
28 fetched from the url from the seed list is placed into the safe list. The
29 safe list contains unsanitized urls.
31 # A list of websites whose hrefs are unlikely to link to sites that contain
34 "https://www.cnn.com",
35 "https://www.youtube.com",
36 "https://www.facebook.com",
37 "https://www.twitter.com",
38 "https://www.yahoo.com",
39 "https://www.amazon.com",
40 "https://www.wikipedia.com",
41 "https://www.bing.com",
42 "https://www.dailymotion.com",
43 "https://www.stackoverflow.com",
44 "https://www.google.com/#q=dumpling",
45 "http://www.baidu.com/s?wd=rice",
46 "http://www.baidu.com/s?wd=cow",
47 "https://www.google.com/#q=fox",
48 "http://www.yahoo.co.jp/",
49 "http://www.yandex.ru/",
50 "https://www.imdb.com/",
51 "http://www.huffingtonpost.com/",
52 "https://www.deviantart.com/",
53 "http://www.wsj.com/",
60 # Fetch and parse the HTML.
61 response
= urllib2
.urlopen(url
)
62 encoding
= response
.headers
.getparam('charset')
63 html
= response
.read()
65 html
= html
.decode(encoding
)
67 parser
= _HRefParser()
70 logging
.exception("Error fetching or parsing url: %s", url
)
73 # Looks for all hrefs.
74 for relative_url
in parser
.hrefs
:
78 absolute_url
= urlparse
.urljoin(url
, relative_url
)
79 safe_urls
.add(absolute_url
)
81 # Sort the urls, to make them easier to view in bulk.
82 safe_urls_list
= list(safe_urls
)
85 print json
.dumps(safe_urls_list
, indent
=2, separators
=(",", ":"))
if __name__ == "__main__":
  GenerateSafeUrls()