calc: on editing invalidation of view with different zoom is wrong
[LibreOffice.git] / bin / get-forum-attachments.py
blob4300778e4fdad30dbce1cd82a74d75092b2a2e65
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 import argparse
10 import configparser
11 import hashlib
12 import magic
13 import os
14 import requests
15 import shutil
16 import sys
17 import tempfile
19 from bs4 import BeautifulSoup
20 from attachment_mimetypes import mimetypes
21 from concurrent.futures import ThreadPoolExecutor, as_completed
22 from requests.adapters import HTTPAdapter
23 from requests.packages.urllib3.util.retry import Retry
# Forums to crawl, keyed by a short language/site code.  The key is also used
# to pick the attachment query format and to name the index file and the
# downloaded files.
#
# lang : [url, doLogin, startIndex]
#   url        -- forum base URL; the attachment query string is appended to it
#   doLogin    -- when True, login() is attempted first and the forum is
#                 skipped if the login fails
#   startIndex -- first attachment id to probe when no saved index file exists
forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],
    # Others
    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    # NOTE(review): plain http combined with doLogin=True means the login
    # POST is sent unencrypted -- confirm whether an https URL is available.
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
    'mso-en4': ["https://www.excelforum.com", True, 5100],
}
def get_attachment_query(lang):
    """Return the URL path and query prefix that precedes an attachment id.

    Forum keys starting with "mso" use the ``attachment.php`` endpoint; every
    other forum uses ``download/file.php``.  The caller appends the numeric
    attachment id to the returned string.
    """
    if lang.startswith("mso"):
        return "/attachment.php?attachmentid="
    return "/download/file.php?id="
def login(session, url, configFile):
    """Log *session* in to the forum at *url*.

    Credentials are read from the ``[login]`` section (options ``username``
    and ``password``) of the INI file *configFile*.  The plain-text password
    field of the form is left empty; the password is sent as an MD5 hex
    digest in both md5 fields.

    Returns True when the response page contains one of the known
    "logged in" confirmation messages, False otherwise (including any
    non-200 response).

    Raises configparser.Error when the section or an option is missing.
    Exceptions raised by ``session.post`` (including timeouts) propagate.
    """
    # Without a timeout a stalled server would block this worker forever.
    request_timeout = 30

    config = configparser.ConfigParser()
    config.read(configFile)
    username = config.get('login', 'username')
    password = config.get('login', 'password')

    # Both md5 form fields carry the same digest, so compute it once.
    password_md5 = hashlib.md5(password.encode()).hexdigest()

    resp = session.post(
        url + '/login.php?do=login',
        {
            'vb_login_username': username,
            'vb_login_password': '',
            'vb_login_md5password': password_md5,
            'vb_login_md5password_utf': password_md5,
            'cookieuser': 1,
            'do': 'login',
            's': '',
            'securitytoken': 'guest',
        },
        timeout=request_timeout,
    )

    if resp.status_code != 200:
        return False

    # The forums answer 200 even for a failed login, so success is detected
    # by looking for the confirmation text (English or German) in a <p>.
    success_markers = ('Thank you for logging in', 'Danke für Ihre Anmeldung')
    soup = BeautifulSoup(resp.content, 'lxml')
    for paragraph in soup.find_all("p"):
        text = paragraph.get_text()
        if any(marker in text for marker in success_markers):
            return True

    return False
def get_attachments_from_url(lang, config, pathes):
    """Crawl one forum and save every attachment whose type is known.

    Attachment ids are probed sequentially with a HEAD request.  A response
    whose content type contains "html" is counted as an invalid link; after
    200 consecutive invalid links the forum is considered exhausted.  Any
    other response is downloaded, its MIME type is detected from the content,
    and it is stored as ``<outdir>/<suffix>/forum-<lang>-<id>.<suffix>`` when
    that MIME type is listed in ``mimetypes``.

    The id of the last saved attachment is written to ``<outdir>/<lang>.index``
    so that a later run resumes right after it.

    lang   -- forum key; selects the attachment query format and is used in
              the index file name and the download file names
    config -- [url, doLogin, startIndex] entry for this forum
    pathes -- object with ``outdir`` (output directory) and ``config``
              (credentials file passed to login()) attributes

    Returns None.  Exceptions raised by the HTTP requests propagate.
    """
    url = config[0]
    doLogin = config[1]
    startIndex = config[2]

    # Without a timeout a stalled server would block this worker forever.
    request_timeout = 60

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(pathes.outdir, lang + ".index")
    if os.path.isfile(indexFile):
        with open(indexFile) as f:
            try:
                startIndex = int(f.readline().strip()) + 1
            except ValueError:
                # An empty or corrupt index file (e.g. from an interrupted
                # run) must not abort the crawl; use the configured start.
                print("Ignoring unreadable index file " + indexFile)

    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    if doLogin:
        if not login(session, url, pathes.config):
            print("Can't log in to " + url)
            return

    invalidCount = 0
    for i in range(startIndex, 999999):
        fileUrl = url + get_attachment_query(lang) + str(i)

        h = session.head(fileUrl, timeout=request_timeout)
        # A response without a Content-Type header yields None; treat it as
        # "not html" instead of failing on the membership test.
        content_type = h.headers.get('content-type') or ''
        if "html" in content_type:
            # Let's assume this is an invalid file link
            invalidCount += 1

            # Let's assume, if we get 200 invalid files, that there are no more files
            if invalidCount == 200:
                print("No more attachments found in " + url)
                break
            continue

        invalidCount = 0

        r = session.get(fileUrl, allow_redirects=True, timeout=request_timeout)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # The file is re-opened by name below; flush so the buffered
            # tail is on disk before it is inspected and copied.
            tmp.flush()

            mimetype = magic.from_file(tmp.name, mime=True)
            if mimetype not in mimetypes:
                continue

            suffix = mimetypes[mimetype]
            suffixDir = os.path.join(pathes.outdir, suffix)
            os.makedirs(suffixDir, exist_ok=True)

            download = os.path.join(suffixDir,
                                    "forum-" + lang + '-' + str(i) + '.' + suffix)

            print("Downloading as " + download)
            shutil.copy(tmp.name, download)

        # Save the index
        with open(indexFile, 'w') as f:
            f.write(str(i))
# Script entry point: validate the command line, then crawl all configured
# forums concurrently.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)

    pathes = parser.parse_args()

    if not os.path.exists(pathes.outdir) or os.path.isfile(pathes.outdir):
        print("Outdir folder doesn't exists")
        sys.exit(1)
    elif not os.path.isfile(pathes.config):
        print("Config file doesn't exists")
        sys.exit(1)

    failed = False
    # by default, 10 at a time seems to work fine
    with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
        # Map each future back to its forum key so failures can be attributed.
        processes = {
            executor.submit(get_attachments_from_url, lang, config, pathes): lang
            for lang, config in forums.items()
        }

        for task in as_completed(processes):
            try:
                result = task.result()
            except Exception as e:
                # One failing forum must not hide the outcome of the others;
                # report it and keep collecting the remaining results.
                failed = True
                print("Crawling forum '" + processes[task] + "' failed: " + repr(e),
                      file=sys.stderr)
                continue
            if result:
                print(result)

    # Keep a non-zero exit status when any worker raised.
    if failed:
        sys.exit(1)