# Christophe DUMEZ (chris@qbittorrent.org)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the author nor the names of its contributors may be
#      used to endorse or promote products derived from this software without
#      specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import gzip
import html.entities
import io
import os
import re
import socket
import socks  # PySocks; used for the optional SOCKS5 proxy support below
import tempfile
import urllib.error
import urllib.request

# Some sites block the default Python User-Agent
user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'
headers = {'User-Agent': user_agent}

# SOCKS5 Proxy support
if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
    proxy_str = os.environ["sock_proxy"].strip()
    m = re.match(r"^(?:(?P<username>[^:]+):(?P<password>[^@]+)@)?(?P<host>[^:]+):(?P<port>\w+)$",
                 proxy_str)
    if m is not None:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, m.group('host'),
                              int(m.group('port')), True, m.group('username'), m.group('password'))
        socket.socket = socks.socksocket
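# Illustrative note (not part of the original module): given the regex above,
# the "sock_proxy" environment variable is expected to look like "host:port"
# or "username:password@host:port", e.g.
#   export sock_proxy="user:pass@127.0.0.1:1080"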
def htmlentitydecode(s):
    # First convert alpha entities (such as &eacute;)
    # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
    def entity2char(m):
        entity = m.group(1)
        if entity in html.entities.name2codepoint:
            return chr(html.entities.name2codepoint[entity])
        return " "  # Unknown entity: we replace it with a space.
    t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)

    # Then convert numerical entities (such as &#233;)
    t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)

    # Then convert hexadecimal entities (such as &#x00E9;)
    return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)
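# Illustrative examples of the decoding above (inputs assumed, not from the original file):
#   htmlentitydecode("caf&eacute;")  -> "café"   (named entity)
#   htmlentitydecode("&#233;")       -> "é"      (decimal entity)
#   htmlentitydecode("&#x00E9;")     -> "é"      (hexadecimal entity)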
def retrieve_url(url):
    """ Return the content of the url page as a string """
    req = urllib.request.Request(url, headers=headers)
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.URLError as errno:
        print(" ".join(("Connection error:", str(errno.reason))))
        return ""
    dat = response.read()
    # Check if it is gzipped
    if dat[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
        compressedstream = io.BytesIO(dat)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        extracted_data = gzipper.read()
        dat = extracted_data
    info = response.info()
    charset = 'utf-8'  # fall back to UTF-8 when no charset is advertised
    try:
        ignore, charset = info['Content-Type'].split('charset=')
    except Exception:
        pass
    dat = dat.decode(charset, 'replace')
    dat = htmlentitydecode(dat)
    # return dat.encode('utf-8', 'replace')
    return dat
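# Sketch of typical use (hypothetical URL): search plugins call retrieve_url()
# to fetch a results page as decoded, entity-free text:
#   page = retrieve_url("https://example.com/search?q=ubuntu")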
def download_file(url, referer=None):
    """ Download file at url and write it to a file, return the path to the file and the url """
    file, path = tempfile.mkstemp()
    file = os.fdopen(file, "wb")
    # Download url
    req = urllib.request.Request(url, headers=headers)
    if referer is not None:
        req.add_header('referer', referer)
    response = urllib.request.urlopen(req)
    dat = response.read()
    # Check if it is gzipped
    if dat[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
        compressedstream = io.BytesIO(dat)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        extracted_data = gzipper.read()
        dat = extracted_data

    # Write the (possibly decompressed) data to the temporary file
    file.write(dat)
    file.close()

    # Return the file path and the url, separated by a space
    return (path + " " + url)
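# Sketch of typical use (hypothetical URL): the returned string packs the
# temporary file path and the original url, separated by a single space:
#   print(download_file("https://example.com/file.torrent"))
#   # e.g. "/tmp/tmpab12cd https://example.com/file.torrent"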