src/searchengine/nova3/helpers.py

   1 #VERSION: 1.49
   2
   3 # Author:
   4 #  Christophe DUMEZ (chris@qbittorrent.org)
   5
   6 # Redistribution and use in source and binary forms, with or without
   7 # modification, are permitted provided that the following conditions are met:
   8 #
   9 #    * Redistributions of source code must retain the above copyright notice,
  10 #      this list of conditions and the following disclaimer.
  11 #    * Redistributions in binary form must reproduce the above copyright
  12 #      notice, this list of conditions and the following disclaimer in the
  13 #      documentation and/or other materials provided with the distribution.
  14 #    * Neither the name of the author nor the names of its contributors may be
  15 #      used to endorse or promote products derived from this software without
  16 #      specific prior written permission.
  17 #
  18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  22 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28 # POSSIBILITY OF SUCH DAMAGE.
  29
  30 import datetime
  31 import gzip
  32 import html.entities
  33 import io
  34 import os
  35 import re
  36 import socket
  37 import socks
  38 import sys
  39 import tempfile
  40 import urllib.error
  41 import urllib.request
  42 from collections.abc import Mapping
  43 from typing import Any, Optional
  44
  45
  46 def getBrowserUserAgent() -> str:
  47     """ Disguise as browser to circumvent website blocking """
  48
  49     # Firefox release calendar
  50     # https://whattrainisitnow.com/calendar/
  51     # https://wiki.mozilla.org/index.php?title=Release_Management/Calendar&redirect=no
  52
  53     baseDate = datetime.date(2024, 4, 16)
  54     baseVersion = 125
  55
  56     nowDate = datetime.date.today()
  57     nowVersion = baseVersion + ((nowDate - baseDate).days // 30)
  58
  59     return f"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{nowVersion}.0) Gecko/20100101 Firefox/{nowVersion}.0"
  60
  61
  62 headers: dict[str, Any] = {'User-Agent': getBrowserUserAgent()}
  63
  64 # SOCKS5 Proxy support
  65 if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
  66     proxy_str = os.environ["sock_proxy"].strip()
  67     m = re.match(r"^(?:(?P<username>[^:]+):(?P<password>[^@]+)@)?(?P<host>[^:]+):(?P<port>\w+)$",
  68                  proxy_str)
  69     if m is not None:
  70         socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, m.group('host'),
  71                               int(m.group('port')), True, m.group('username'), m.group('password'))
  72         socket.socket = socks.socksocket  # type: ignore[misc]
  73
  74
  75 def htmlentitydecode(s: str) -> str:
  76     # First convert alpha entities (such as &eacute;)
  77     # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
  78     def entity2char(m: re.Match[str]) -> str:
  79         entity = m.group(1)
  80         if entity in html.entities.name2codepoint:
  81             return chr(html.entities.name2codepoint[entity])
  82         return " "  # Unknown entity: We replace with a space.
  83     t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)
  84
  85     # Then convert numerical entities (such as &#233;)
  86     t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)
  87
  88     # Then convert hexa entities (such as &#x00E9;)
  89     return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)
  90
  91
  92 def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None) -> str:
  93     """ Return the content of the url page as a string """
  94
  95     request = urllib.request.Request(url, request_data, {**headers, **custom_headers})
  96     try:
  97         response = urllib.request.urlopen(request)
  98     except urllib.error.URLError as errno:
  99         print(f"Connection error: {errno.reason}", file=sys.stderr)
 100         return ""
 101     data: bytes = response.read()
 102
 103     # Check if it is gzipped
 104     if data[:2] == b'\x1f\x8b':
 105         # Data is gzip encoded, decode it
 106         with io.BytesIO(data) as compressedStream, gzip.GzipFile(fileobj=compressedStream) as gzipper:
 107             data = gzipper.read()
 108
 109     charset = 'utf-8'
 110     try:
 111         charset = response.getheader('Content-Type', '').split('charset=', 1)[1]
 112     except IndexError:
 113         pass
 114
 115     dataStr = data.decode(charset, 'replace')
 116     dataStr = htmlentitydecode(dataStr)
 117     return dataStr
 118
 119
 120 def download_file(url: str, referer: Optional[str] = None) -> str:
 121     """ Download file at url and write it to a file, return the path to the file and the url """
 122
 123     # Download url
 124     request = urllib.request.Request(url, headers=headers)
 125     if referer is not None:
 126         request.add_header('referer', referer)
 127     response = urllib.request.urlopen(request)
 128     data = response.read()
 129
 130     # Check if it is gzipped
 131     if data[:2] == b'\x1f\x8b':
 132         # Data is gzip encoded, decode it
 133         with io.BytesIO(data) as compressedStream, gzip.GzipFile(fileobj=compressedStream) as gzipper:
 134             data = gzipper.read()
 135
 136     # Write it to a file
 137     fileHandle, path = tempfile.mkstemp()
 138     with os.fdopen(fileHandle, "wb") as file:
 139         file.write(data)
 140
 141     # return file path
 142     return f"{path} {url}"