# This file is part of the LibreOffice project.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import configparser
import hashlib
import os
import shutil
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed

import magic
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from attachment_mimetypes import mimetypes
# Forums to crawl, keyed by a short forum id.
# Each value is a list: [url, doLogin, startIndex]
#   url        - base URL the attachment query is appended to
#   doLogin    - whether a login is attempted before downloading
#   startIndex - first attachment id to probe when no index file exists
# Ids starting with "mso" use a different attachment query string
# (see get_attachment_query).
forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],
    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
    'mso-en4': ["https://www.excelforum.com", True, 5100],
    # forum : [url, doLogin, startIndex]
}
def get_attachment_query(forum):
    """Return the URL suffix that precedes an attachment id for *forum*.

    Forum ids starting with "mso" get "/attachment.php?attachmentid=";
    every other forum id gets "/download/file.php?id=". The caller appends
    the attachment id to the returned string.
    """
    if forum.startswith("mso"):
        return "/attachment.php?attachmentid="
    return "/download/file.php?id="
# NOTE(review): the `def` line and the `return` of this function are not
# present in the extract; they are reconstructed from the call sites, which
# do `session = createSession()` and then use the result as a session.
def createSession():
    """Return a requests session that retries failed connections.

    The same adapter is mounted for both 'http://' and 'https://' URLs.
    """
    session = requests.Session()
    # Retry connection errors up to 3 times, with a backoff factor of 0.5.
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
def login(session, url, configFile):
    """Log *session* in to the forum at *url*; return True on success.

    The credentials are read from the 'username' and 'password' options of
    the [login] section of *configFile*. They are posted to
    url + '/login.php?do=login'; the password travels only as an MD5 hex
    digest, the plain-text password field is sent empty.

    The login counts as successful when the response status is 200 and one
    of the page's <p> elements contains a known confirmation message.

    NOTE(review): the `return` statements of this function are not present
    in the extract. The boolean result is reconstructed from the callers,
    which test `if not login(...)` - confirm against the upstream file.
    """
    config = configparser.ConfigParser()
    config.read(configFile)
    username = config.get('login', 'username')
    password = config.get('login', 'password')

    # Both digest fields carry the same value, so hash only once.
    md5_password = hashlib.md5(password.encode()).hexdigest()

    # NOTE(review): embedded source lines 74-76 (between the
    # 'vb_login_md5password_utf' and 'securitytoken' entries) are missing
    # from the extract. If the upstream file sends further form fields,
    # restore them here.
    form = {
        'vb_login_username': username,
        'vb_login_password': '',
        'vb_login_md5password': md5_password,
        'vb_login_md5password_utf': md5_password,
        'securitytoken': 'guest',
    }
    resp = session.post(url + '/login.php?do=login', form)

    if resp.status_code != 200:
        return False

    # The confirmation text depends on the forum's language.
    success_messages = ('Thank you for logging in', 'Danke für Ihre Anmeldung')
    soup = BeautifulSoup(resp.content, 'lxml')
    for p in soup.find_all("p"):
        text = p.get_text()
        if any(message in text for message in success_messages):
            return True

    return False
def get_attachments_from_url(forum, config, args):
    """Crawl one forum and save every attachment with a known mimetype.

    forum  -- forum id; used in the index file name and the download names.
    config -- the forum's [url, doLogin, startIndex] entry.
    args   -- parsed command line; reads args.outdir and args.config.

    Attachment ids are probed one by one, starting at startIndex or, when
    "<outdir>/<forum>.index" exists, at the id after the one stored there.
    A HEAD response whose content type mentions "html" counts as an invalid
    link. Other ids are downloaded, their mimetype is detected from the
    content, and recognised files are stored as
    "<outdir>/<suffix>/forum-<forum>-<id>.<suffix>".

    NOTE(review): several statements of this function are not present in
    the extract (assignments of url, doLogin, invalidCount and header, the
    temp-file write, the directory creation and the index write). They are
    reconstructed from how the visible code uses those names - confirm
    against the upstream file.
    """
    # Positions follow the "[url, doLogin, startIndex]" layout.
    url = config[0]
    doLogin = config[1]
    startIndex = config[2]

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(args.outdir, forum + ".index")
    if os.path.isfile(indexFile):
        with open(indexFile) as f:
            startIndex = int(f.readline().rstrip()) + 1

    session = createSession()

    if doLogin and not login(session, url, args.config):
        print("Can't log in to " + url)
        return

    invalidCount = 0
    for i in range(startIndex, 999999):
        fileUrl = url + get_attachment_query(forum) + str(i)

        h = session.head(fileUrl)
        header = h.headers
        # A response without a Content-Type header yields None; treat it
        # as "not html" instead of failing on the `in` test below.
        content_type = header.get('content-type') or ''
        if "html" in content_type:
            # Let's assume this is an invalid file link
            invalidCount += 1

            # Let's assume, if we get 200 invalid files, that there are no more files
            if invalidCount == 200:
                print("No more attachments found in " + url)
                break
            continue

        # NOTE(review): presumably only consecutive invalid links count
        # towards the limit, so a valid one resets the counter - confirm.
        invalidCount = 0

        r = session.get(fileUrl, allow_redirects=True)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # The file is read back by name below, so the buffered bytes
            # must be on disk first.
            tmp.flush()

            mimetype = magic.from_file(tmp.name, mime=True)
            if mimetype not in mimetypes:
                continue

            suffix = mimetypes[mimetype]
            suffixDir = os.path.join(args.outdir, suffix)
            # One sub-directory per file suffix; it may already exist.
            os.makedirs(suffixDir, exist_ok=True)

            download = os.path.join(
                suffixDir,
                "forum-" + forum + '-' + str(i) + '.' + suffix)

            print("Downloading as " + download)
            shutil.copy(tmp.name, download)

            # Remember the last stored id so a later run resumes after it.
            with open(indexFile, 'w') as f:
                f.write(str(i))
# Script entry point.
#
# Without --get-file every forum in `forums` is crawled in parallel.
# With --get-file a single attachment is fetched again; the argument must
# use the name the crawler produced: "forum-<forum>-<id>.<suffix>".
#
# NOTE(review): the exit statements after each error message, the
# `processes = []` initialisation, the handling of `result` and the
# temp-file write are not present in the extract; they are reconstructed
# from the surrounding code - confirm against the upstream file.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)
    parser.add_argument('--get-file', action="store", dest="fileName", required=False)

    args = parser.parse_args()

    if not os.path.exists(args.outdir) or os.path.isfile(args.outdir):
        print("Outdir folder doesn't exists")
        raise SystemExit(1)
    elif not os.path.exists(args.config) or not os.path.isfile(args.config):
        print("Config file doesn't exists")
        raise SystemExit(1)

    if not args.fileName:
        processes = []
        # by default, 10 at a time seems to work fine
        with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
            for forum, config in forums.items():
                processes.append(executor.submit(get_attachments_from_url, forum, config, args))

        for task in as_completed(processes):
            # result() re-raises any exception raised inside the worker.
            result = task.result()
            if result:
                print(result)
    else:
        fileNameSplit = args.fileName.split("-")
        if fileNameSplit[0] != "forum" or len(fileNameSplit) not in (3, 4):
            print("Incorrect file name")
            raise SystemExit(1)

        forum = fileNameSplit[1]
        fileId = fileNameSplit[2]
        # "mso-*" forum ids contain a dash themselves, so such names split
        # into four parts instead of three.
        if forum == "mso" and len(fileNameSplit) == 4:
            forum += "-" + fileNameSplit[2]
            fileId = fileNameSplit[3]

        # Also rejects a three-part "forum-mso-<id>" name, which would
        # otherwise fail with an IndexError / KeyError.
        if forum not in forums:
            print("Incorrect file name")
            raise SystemExit(1)

        url = forums[forum][0]
        # Drop the ".<suffix>" part to get the bare attachment id.
        fileUrl = url + get_attachment_query(forum) + fileId.split(".")[0]

        session = createSession()

        doLogin = forums[forum][1]
        if doLogin and not login(session, url, args.config):
            print("Can't log in to " + url)
            raise SystemExit(1)

        r = session.get(fileUrl, allow_redirects=True)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # The file is copied by name, so flush the buffered bytes first.
            tmp.flush()

            download = os.path.join(args.outdir, args.fileName)

            print("Downloading " + fileUrl + " as " + download)
            shutil.copy(tmp.name, download)