src/searchengine/nova3/helpers.py
#VERSION: 1.51

# Author:
#  Christophe DUMEZ (chris@qbittorrent.org)

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the author nor the names of its contributors may be
#      used to endorse or promote products derived from this software without
#      specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import datetime
import gzip
import html
import io
import os
import re
import socket
import socks
import ssl
import sys
import tempfile
import urllib.error
import urllib.request
from collections.abc import Mapping
from typing import Any, Optional


def getBrowserUserAgent() -> str:
    """ Disguise as browser to circumvent website blocking """

    # Firefox release calendar
    # https://whattrainisitnow.com/calendar/
    # https://wiki.mozilla.org/index.php?title=Release_Management/Calendar&redirect=no

    baseDate = datetime.date(2024, 4, 16)
    baseVersion = 125

    nowDate = datetime.date.today()
    nowVersion = baseVersion + ((nowDate - baseDate).days // 30)

    return f"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{nowVersion}.0) Gecko/20100101 Firefox/{nowVersion}.0"
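
# Illustrative check of getBrowserUserAgent()'s estimate (it assumes roughly one
# Firefox release every ~30 days, as the code above does): on 2024-10-16 the
# offset is 183 days // 30 = 6, so the advertised version would be 125 + 6 = 131.0.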
headers: dict[str, Any] = {'User-Agent': getBrowserUserAgent()}

# SOCKS5 Proxy support
if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
    proxy_str = os.environ["sock_proxy"].strip()
    m = re.match(r"^(?:(?P<username>[^:]+):(?P<password>[^@]+)@)?(?P<host>[^:]+):(?P<port>\w+)$",
                 proxy_str)
    if m is not None:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, m.group('host'),
                              int(m.group('port')), True, m.group('username'), m.group('password'))
        socket.socket = socks.socksocket  # type: ignore[misc]
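
# A sketch of the sock_proxy value accepted by the regex above; the address and
# credentials below are purely illustrative, and the credentials part is optional:
#
#   sock_proxy="username:password@127.0.0.1:1080"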
# This is only provided for backward compatibility, new code should not use it
htmlentitydecode = html.unescape


def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None, ssl_context: Optional[ssl.SSLContext] = None, unescape_html_entities: bool = True) -> str:
    """ Return the content of the url page as a string """

    request = urllib.request.Request(url, request_data, {**headers, **custom_headers})
    try:
        response = urllib.request.urlopen(request, context=ssl_context)
    except urllib.error.URLError as errno:
        print(f"Connection error: {errno.reason}", file=sys.stderr)
        return ""
    data: bytes = response.read()

    # Check if it is gzipped
    if data[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
        with io.BytesIO(data) as compressedStream, gzip.GzipFile(fileobj=compressedStream) as gzipper:
            data = gzipper.read()

    charset = 'utf-8'
    try:
        charset = response.getheader('Content-Type', '').split('charset=', 1)[1]
    except IndexError:
        pass

    dataStr = data.decode(charset, 'replace')

    if unescape_html_entities:
        dataStr = html.unescape(dataStr)

    return dataStr
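
# A minimal usage sketch for retrieve_url(); the URL and extra header below are
# hypothetical, not part of this module. An empty string indicates a connection
# error (already reported to stderr above):
#
#   page = retrieve_url("https://example.org/search?q=ubuntu",
#                       custom_headers={"Accept-Language": "en-US"})
#   if page:
#       ...  # parse the returned HTML here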
def download_file(url: str, referer: Optional[str] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
    """ Download file at url and write it to a file, return the path to the file and the url """

    # Download url
    request = urllib.request.Request(url, headers=headers)
    if referer is not None:
        request.add_header('referer', referer)
    response = urllib.request.urlopen(request, context=ssl_context)
    data = response.read()

    # Check if it is gzipped
    if data[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
        with io.BytesIO(data) as compressedStream, gzip.GzipFile(fileobj=compressedStream) as gzipper:
            data = gzipper.read()

    # Write it to a file
    fileHandle, path = tempfile.mkstemp()
    with os.fdopen(fileHandle, "wb") as file:
        file.write(data)

    # return file path
    return f"{path} {url}"
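
# A minimal usage sketch for download_file(); the URL is hypothetical. As built
# in the return statement above, the result is the temporary file path and the
# source URL separated by a single space:
#
#   result = download_file("https://example.org/file.torrent")
#   path, source_url = result.split(' ', 1)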