# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import configparser
import hashlib
import os
import shutil
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed

import magic
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from attachment_mimetypes import mimetypes
# Forums to scrape, keyed by a short language/site tag.
# lang : [url, doLogin, startIndex]
#   url        -- base URL of the forum, without a trailing slash
#   doLogin    -- whether login() must succeed before attachments are fetched
#   startIndex -- first attachment id to probe when no saved index exists
forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],
    # Keys starting with "mso" select the attachment.php query in
    # get_attachment_query().
    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
    'mso-en4': ["https://www.excelforum.com", True, 5100],
}
def get_attachment_query(lang):
    """Return the path-and-query prefix used to fetch an attachment by id.

    Args:
        lang: Forum key; keys starting with "mso" use the
            ``attachment.php`` endpoint, all others ``download/file.php``.

    Returns:
        The string to insert between the forum base URL and the numeric
        attachment id.
    """
    if lang.startswith("mso"):
        return "/attachment.php?attachmentid="
    return "/download/file.php?id="
def login(session, url, configFile):
    """Log ``session`` in to the forum at ``url``.

    Credentials are read from the ``[login]`` section (``username`` and
    ``password`` options) of the INI file ``configFile``. The password is
    never sent in clear text: only its MD5 hex digest is posted, in the
    ``vb_login_md5password*`` fields.

    Args:
        session: Object with a ``post(url, data)`` method, e.g. a
            ``requests.Session``; it keeps whatever cookies the forum sets.
        url: Base URL of the forum, without a trailing slash.
        configFile: Path of the INI file holding the credentials.

    Returns:
        True when the response page contains one of the known
        "logged in" confirmation messages, False otherwise (including any
        non-200 response).

    Raises:
        configparser.Error: if the ``[login]`` section or one of its
            options is missing from ``configFile``.
    """
    config = configparser.ConfigParser()
    config.read(configFile)
    username = config.get('login', 'username')
    password = config.get('login', 'password')

    # Both digest fields carry the same value, so hash the password once.
    passwordDigest = hashlib.md5(password.encode()).hexdigest()

    # NOTE(review): only the form fields visible in this file are sent;
    # confirm against the forum's login form whether further hidden fields
    # are required.
    resp = session.post(url + '/login.php?do=login', {
        'vb_login_username': username,
        'vb_login_password': '',
        'vb_login_md5password': passwordDigest,
        'vb_login_md5password_utf': passwordDigest,
        'securitytoken': 'guest',
    })

    if resp.status_code != 200:
        return False

    # The forum answers 200 even for bad credentials, so success has to be
    # detected from the confirmation text (English or German forums).
    confirmations = ('Thank you for logging in', 'Danke für Ihre Anmeldung')
    soup = BeautifulSoup(resp.content, 'lxml')
    return any(
        message in paragraph.get_text()
        for paragraph in soup.find_all("p")
        for message in confirmations
    )
def get_attachments_from_url(lang, config, pathes):
    """Download the attachments of one forum into ``pathes.outdir``.

    Attachment ids are probed one after another. A HEAD response whose
    content type mentions "html" is counted as an invalid link; once 200
    invalid links have been seen in a row the forum is considered
    exhausted. Anything else is downloaded, its MIME type is detected from
    the content, and the file is kept only if that type is a key of
    ``mimetypes``. Kept files are stored as
    ``<outdir>/<suffix>/forum-<lang>-<id>.<suffix>`` and the id is written
    to ``<outdir>/<lang>.index`` so that a later run resumes after it.

    Args:
        lang: Forum key, used for the index file name, the output file
            names and to pick the attachment query.
        config: ``[url, doLogin, startIndex]`` entry describing the forum.
        pathes: Parsed command-line arguments; ``outdir`` and ``config``
            (credentials file) are read.

    Returns:
        None.
    """
    url = config[0]
    doLogin = config[1]
    startIndex = config[2]

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(pathes.outdir, lang + ".index")
    if os.path.isfile(indexFile):
        with open(indexFile) as f:
            try:
                startIndex = int(f.readline().strip()) + 1
            except ValueError:
                # An empty or truncated index (e.g. interrupted write) must
                # not abort the run; fall back to the configured start.
                print("Ignoring unreadable index file " + indexFile)

    # The context manager closes the pooled connections when done.
    with requests.Session() as session:
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        if doLogin and not login(session, url, pathes.config):
            print("Can't log in to " + url)
            return

        # NOTE(review): the counter handling below (increment on an HTML
        # answer, reset on a real file) is reconstructed from the comments
        # in the original; confirm the reset is intended.
        invalidCount = 0
        for i in range(startIndex, 999999):
            fileUrl = url + get_attachment_query(lang) + str(i)

            h = session.head(fileUrl)
            # A missing header yields None; treat it as "not HTML" instead
            # of failing on the membership test.
            content_type = h.headers.get('content-type') or ""
            if "html" in content_type:
                # Let's assume this is an invalid file link
                invalidCount += 1

                # Let's assume, if we get 200 invalid files, that there are no more files
                if invalidCount == 200:
                    print("No more attachments found in " + url)
                    break
                continue

            invalidCount = 0

            r = session.get(fileUrl, allow_redirects=True)
            with tempfile.NamedTemporaryFile() as tmp:
                tmp.write(r.content)
                # libmagic and shutil reopen the file by name, so buffered
                # data has to reach the disk first.
                tmp.flush()

                mimetype = magic.from_file(tmp.name, mime=True)
                if mimetype not in mimetypes:
                    continue

                suffix = mimetypes[mimetype]
                suffixDir = os.path.join(pathes.outdir, suffix)
                os.makedirs(suffixDir, exist_ok=True)

                download = os.path.join(
                    suffixDir,
                    "forum-" + lang + '-' + str(i) + '.' + suffix)

                print("Downloading as " + download)
                shutil.copy(tmp.name, download)

            # Save the index
            with open(indexFile, 'w') as f:
                f.write(str(i))
if __name__ == '__main__':
    # Command-line entry point: validate the arguments, then scrape every
    # forum listed in ``forums`` on a thread pool.
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)

    pathes = parser.parse_args()

    if not os.path.isdir(pathes.outdir):
        print("Outdir folder doesn't exists")
        raise SystemExit(1)
    if not os.path.isfile(pathes.config):
        print("Config file doesn't exists")
        raise SystemExit(1)

    processes = []
    # by default, 10 at a time seems to work fine
    with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
        for lang, config in forums.items():
            processes.append(executor.submit(get_attachments_from_url, lang, config, pathes))

    # result() re-raises any exception that occurred in a worker, so a
    # failed forum does not go unnoticed.
    for task in as_completed(processes):
        task.result()