# Christophe DUMEZ (chris@qbittorrent.org)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the author nor the names of its contributors may be
#      used to endorse or promote products derived from this software without
#      specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import gzip
import html.entities
import io
import os
import re
import socket
import socks  # PySocks; used for the optional SOCKS5 proxy support below
import tempfile
import urllib.error
import urllib.request

# Some sites block the default Python User-Agent
user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'
headers = {'User-Agent': user_agent}

# SOCKS5 Proxy support
if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
    proxy_str = os.environ["sock_proxy"].strip()
    m = re.match(r"^(?:(?P<username>[^:]+):(?P<password>[^@]+)@)?(?P<host>[^:]+):(?P<port>\w+)$",
                 proxy_str)
    if m is not None:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, m.group('host'),
                              int(m.group('port')), True, m.group('username'), m.group('password'))
        socket.socket = socks.socksocket
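# Illustrative note (not part of the original module): given the regex above,
# the "sock_proxy" environment variable is expected to look like "host:port"
# or "username:password@host:port", e.g.
#   export sock_proxy="user:pass@127.0.0.1:1080"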
def htmlentitydecode(s):
    # First convert alpha entities (such as &eacute;)
    # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
    def entity2char(m):
        entity = m.group(1)
        if entity in html.entities.name2codepoint:
            return chr(html.entities.name2codepoint[entity])
        return " "  # Unknown entity: we replace it with a space.
    t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)

    # Then convert numerical entities (such as &#233;)
    t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)

    # Then convert hexadecimal entities (such as &#x00E9;)
    return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)
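# Illustrative examples of the decoding above (inputs assumed, not from the original file):
#   htmlentitydecode("caf&eacute;")  -> "café"   (named entity)
#   htmlentitydecode("&#233;")       -> "é"      (decimal entity)
#   htmlentitydecode("&#x00E9;")     -> "é"      (hexadecimal entity)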
def retrieve_url(url):
    """ Return the content of the url page as a string """
    req = urllib.request.Request(url, headers=headers)
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.URLError as errno:
        print(" ".join(("Connection error:", str(errno.reason))))
        return ""
    dat = response.read()
    # Check if it is gzipped
    if dat[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
        compressedstream = io.BytesIO(dat)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        extracted_data = gzipper.read()
        dat = extracted_data
    info = response.info()
    charset = 'utf-8'  # fall back to UTF-8 when no charset is advertised
    try:
        ignore, charset = info['Content-Type'].split('charset=')
    except Exception:
        pass
    dat = dat.decode(charset, 'replace')
    dat = htmlentitydecode(dat)
    # return dat.encode('utf-8', 'replace')
    return dat
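# Sketch of typical use (hypothetical URL): search plugins call retrieve_url()
# to fetch a results page as decoded, entity-free text:
#   page = retrieve_url("https://example.com/search?q=ubuntu")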
def download_file(url, referer=None):
    """ Download file at url and write it to a file, return the path to the file and the url """
    file, path = tempfile.mkstemp()
    file = os.fdopen(file, "wb")
    # Download url
    req = urllib.request.Request(url, headers=headers)
    if referer is not None:
        req.add_header('referer', referer)
    response = urllib.request.urlopen(req)
    dat = response.read()
    # Check if it is gzipped
    if dat[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
        compressedstream = io.BytesIO(dat)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        extracted_data = gzipper.read()
        dat = extracted_data

    # Write the (possibly decompressed) data to the temporary file
    file.write(dat)
    file.close()

    # Return the file path and the url, separated by a space
    return (path + " " + url)
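# Sketch of typical use (hypothetical URL): the returned string packs the
# temporary file path and the original url, separated by a single space:
#   print(download_file("https://example.com/file.torrent"))
#   # e.g. "/tmp/tmpab12cd https://example.com/file.torrent"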