Remove duplicated include
[LibreOffice.git] / bin / get-forum-attachments.py
blob74827fff929cccc7a95d4658eaf59cf7441715d8
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 import argparse
10 import configparser
11 import hashlib
12 import magic
13 import os
14 import requests
15 import shutil
16 import sys
17 import tempfile
19 from bs4 import BeautifulSoup
20 from attachment_mimetypes import mimetypes
21 from concurrent.futures import ThreadPoolExecutor, as_completed
22 from requests.adapters import HTTPAdapter
23 from requests.packages.urllib3.util.retry import Retry
# Forums to crawl, keyed by a short forum identifier.
#
# forum : [url, doLogin, startIndex]
#   url        - base URL of the forum; the attachment query is appended to it
#   doLogin    - True if a login must be performed before probing attachments
#   startIndex - first attachment id to probe when no saved index file exists
forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],
    # Others
    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
    'mso-en4': ["https://www.excelforum.com", True, 5100],
}
def get_attachment_query(forum):
    """Return the path-and-query prefix used to request an attachment by id.

    Forum keys that begin with "mso" use the ``attachment.php`` form; every
    other key uses the ``download/file.php`` form. The caller appends the
    numeric attachment id to the returned string.
    """
    is_mso_forum = forum.startswith("mso")
    return "/attachment.php?attachmentid=" if is_mso_forum else "/download/file.php?id="
def createSession():
    """Create a ``requests`` session whose HTTP and HTTPS traffic is retried.

    A single adapter configured with ``Retry(connect=3, backoff_factor=0.5)``
    is mounted for both URL schemes.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(connect=3, backoff_factor=0.5))

    new_session = requests.Session()
    for scheme in ('http://', 'https://'):
        new_session.mount(scheme, retrying_adapter)
    return new_session
def login(session, url, configFile):
    """Log in to the forum at *url* using credentials from *configFile*.

    *configFile* is an INI file with a ``[login]`` section providing the
    ``username`` and ``password`` options. The clear-text password field is
    posted empty; only the MD5 hex digest of the password is sent.

    Returns True when the response has status 200 and one of its ``<p>``
    elements contains a known "logged in" message, False otherwise.

    Raises ``configparser.NoSectionError`` / ``configparser.NoOptionError``
    when the section or an option is missing from the config file.
    """
    config = configparser.ConfigParser()

    config.read(configFile)
    username = config.get('login', 'username')
    password = config.get('login', 'password')

    # Both digest fields carry the same value, so hash the password once.
    password_md5 = hashlib.md5(password.encode()).hexdigest()

    resp = session.post(url + '/login.php?do=login', {
        'vb_login_username': username,
        'vb_login_password': '',
        'vb_login_md5password': password_md5,
        'vb_login_md5password_utf': password_md5,
        'cookieuser': 1,
        'do': 'login',
        's': '',
        'securitytoken': 'guest',
    })

    if resp.status_code != 200:
        return False

    # The forums answer in English or German depending on the site.
    success_messages = ('Thank you for logging in', 'Danke für Ihre Anmeldung')

    soup = BeautifulSoup(resp.content, 'lxml')
    for p in soup.find_all("p"):
        text = p.get_text()
        if any(message in text for message in success_messages):
            return True

    return False
def get_attachments_from_url(forum, config, args):
    """Probe one forum's attachment ids in order and save recognised files.

    forum  -- key of the forum in ``forums``; also used in output file names
    config -- the ``[url, doLogin, startIndex]`` entry for that forum
    args   -- parsed command line; ``args.outdir`` and ``args.config`` are read

    Probing starts at ``startIndex``, or one past the id stored in
    ``<outdir>/<forum>.index`` when that file exists. Each id is first checked
    with a HEAD request; a response whose Content-Type mentions "html" is
    counted as an invalid link, and 200 such responses in a row end the scan.
    Other responses are downloaded, their MIME type is sniffed from the
    content, and files whose type is a key of ``mimetypes`` are stored as
    ``<outdir>/<suffix>/forum-<forum>-<id>.<suffix>``. The index file is
    rewritten after every stored file so that a later run can resume.

    Returns None.
    """
    url = config[0]
    doLogin = config[1]
    startIndex = config[2]

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(args.outdir, forum + ".index")
    if os.path.isfile(indexFile):
        with open(indexFile) as f:
            savedIndex = f.readline().rstrip()
        try:
            startIndex = int(savedIndex) + 1
        except ValueError:
            # An empty or damaged index file (e.g. an interrupted write) must
            # not abort the crawl; fall back to the configured start index.
            print("Ignoring invalid index file " + indexFile)

    session = createSession()

    if doLogin:
        if not login(session, url, args.config):
            print("Can't log in to " + url)
            return

    # The query prefix depends only on the forum, so build it once.
    attachmentUrl = url + get_attachment_query(forum)

    invalidCount = 0
    for i in range(startIndex, 999999):
        fileUrl = attachmentUrl + str(i)

        h = session.head(fileUrl)
        # A response may lack a Content-Type header; treat that as "not html"
        # and let the content sniffing below decide, instead of raising.
        content_type = h.headers.get('content-type') or ''
        if "html" in content_type:
            # Let's assume this is an invalid file link
            invalidCount += 1

            # Let's assume, if we get 200 invalid files, that there are no more files
            if invalidCount == 200:
                print("No more attachments found in " + url)
                break
            continue

        invalidCount = 0

        r = session.get(fileUrl, allow_redirects=True)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # The file is read back by name below, so the buffered data must
            # reach the disk first.
            tmp.flush()

            mimetype = magic.from_file(tmp.name, mime=True)
            if mimetype not in mimetypes:
                continue

            suffix = mimetypes[mimetype]
            suffixDir = os.path.join(args.outdir, suffix)
            # Several forums are crawled concurrently and may create the same
            # directory; exist_ok makes that harmless without hiding real
            # errors such as missing permissions.
            os.makedirs(suffixDir, exist_ok=True)

            download = os.path.join(suffixDir,
                                    "forum-" + forum + '-' + str(i) + '.' + suffix)

            print("Downloading as " + download)
            shutil.copy(tmp.name, download)

            # Save the index
            with open(indexFile, 'w') as f:
                f.write(str(i))
# Script entry point.
#
# Without --get-file every forum in `forums` is crawled concurrently
# (PARALLELISM environment variable, default 10 workers). With
# --get-file forum-<forum>-<id>[.<ext>] only that attachment is fetched again
# and stored as <outdir>/<file name>.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)
    parser.add_argument('--get-file', action="store", dest="fileName", required=False)

    args = parser.parse_args()

    if not os.path.exists(args.outdir) or os.path.isfile(args.outdir):
        print("Outdir folder doesn't exists")
        sys.exit(1)
    elif not os.path.isfile(args.config):
        print("Config file doesn't exists")
        sys.exit(1)

    if not args.fileName:
        processes = []
        # by default, 10 at a time seems to work fine
        with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
            for forum, config in forums.items():
                processes.append(executor.submit(get_attachments_from_url, forum, config, args))

        for task in as_completed(processes):
            # result() re-raises any exception from the worker thread.
            result = task.result()
            if result:
                print(result)
    else:
        fileNameSplit = args.fileName.split("-")
        if fileNameSplit[0] != "forum" or len(fileNameSplit) not in (3, 4):
            print("Incorrect file name")
            sys.exit(1)

        forum = fileNameSplit[1]
        fileId = fileNameSplit[2]
        if forum == "mso":
            # "mso-*" forum keys contain a dash themselves, so such names
            # need four dash-separated parts: forum-mso-<x>-<id>.
            if len(fileNameSplit) != 4:
                print("Incorrect file name")
                sys.exit(1)
            forum += "-" + fileNameSplit[2]
            fileId = fileNameSplit[3]

        if forum not in forums:
            print("Incorrect file name")
            sys.exit(1)

        url = forums[forum][0]
        # Drop the extension: only the numeric id is part of the query.
        fileUrl = url + get_attachment_query(forum) + fileId.split(".")[0]

        session = createSession()

        doLogin = forums[forum][1]
        if doLogin:
            if not login(session, url, args.config):
                print("Can't log in to " + url)
                sys.exit(1)

        r = session.get(fileUrl, allow_redirects=True)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # shutil.copy reads the file by name, so flush the buffered data
            # first or the copy may be empty or truncated.
            tmp.flush()

            download = os.path.join(args.outdir, args.fileName)

            print("Downloading " + fileUrl + " as " + download)
            shutil.copy(tmp.name, download)