LanguageTool: don't crash if REST protocol isn't set
[LibreOffice.git] / bin / get-bugzilla-attachments-by-mimetype
blob47669d04434abc7f61a307dce95fc7e104c30efb
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # This file is part of the LibreOffice project.
6 # This Source Code Form is subject to the terms of the Mozilla Public
7 # License, v. 2.0. If a copy of the MPL was not distributed with this
8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 # This digs through a pile of Bugzilla instances and populates the cwd with a big
12 # collection of bug-docs in per-filetype dirs with bug-ids as names with
13 # prefixes to indicate which bug-tracker, e.g.
15 # fdo-bugid-X.suffix
16 # rhbz-bugid-X.suffix
17 # moz-bugid-X.suffix
19 # where X is the n'th attachment of that type in the bug
21 # The results are stored in the current directory, categorized by the
22 # extension of the downloaded file. When a file already exists, it is assumed
23 # it is already downloaded by a previous run, and up-to-date.
25 from __future__ import print_function
26 import feedparser
27 import base64
28 import datetime
29 import glob
30 import re
31 import os, os.path
32 import stat
33 import sys
34 import threading
35 try:
36 import queue
37 except:
38 import Queue as queue
39 try:
40 from urllib.request import urlopen
41 except:
42 from urllib import urlopen
43 try:
44 import xmlrpc.client as xmlrpclib
45 except:
46 import xmlrpclib
47 from xml.dom import minidom
48 from xml.sax.saxutils import escape
def urlopen_retry(url, maxretries=3):
    """Open *url* and return the handle, retrying on IOError.

    The first attempt is followed by up to *maxretries* further attempts.
    Each failure is reported on stdout; once the retries are exhausted the
    last IOError is re-raised to the caller.

    url        -- URL passed unchanged to urlopen()
    maxretries -- number of additional attempts after the first one
                  (negative values are treated as 0, i.e. a single attempt)
    """
    # Guard against a negative count, which would otherwise skip the loop
    # entirely and silently return None.
    maxretries = max(0, maxretries)
    for attempt in range(maxretries + 1):
        try:
            return urlopen(url)
        except IOError as e:
            print("caught IOError: " + str(e))
            if attempt == maxretries:
                raise
            print("retrying...")
def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
    """Download the attachments of one bug that match *mimetype*.

    The bug is fetched as XML (``url`` + ``&ctype=xml``); attachment bodies
    are embedded base64-encoded in ``<data>`` elements.  Matching attachments
    are written to ``<suffix>/<prefix><bugid>-<n>.<suffix>`` where ``n`` is
    the 1-based position of the attachment in the bug.  Files that already
    exist are assumed to be up to date and are not downloaded again.

    url      -- show_bug URL ending in ``=<bugid>``
    mimetype -- mimetype to download (compared case-insensitively)
    prefix   -- bug tracker prefix used in the file name (e.g. "fdo")
    suffix   -- file extension, also the name of the target directory
    """
    # Take whatever follows the LAST '=' so that URLs carrying additional
    # query parameters still yield the bug id.
    bugid = url.rsplit('=', 1)[-1]
    print("id is " + prefix + bugid + " " + suffix)
    print("parsing " + bugid)
    sock = urlopen_retry(url + "&ctype=xml")
    try:
        dom = minidom.parse(sock)
    finally:
        # Close the connection even when the XML cannot be parsed.
        sock.close()

    attachmentid = 0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(" mimetype is", end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # check if attachment is deleted
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if attachment is deleted
                # (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                basename = bugid + '-' + str(attachmentid) + '.' + suffix
                download = suffix + '/' + prefix + basename
                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                # prevent re-downloading FDO attachments from TDF
                if prefix == "tdf" and int(bugid) < 88776:
                    # Build the FDO name explicitly instead of replacing
                    # every "tdf" substring in the path.
                    fdodownload = suffix + '/' + "fdo" + basename
                    if os.path.isfile(fdodownload):
                        print("assuming FDO " + fdodownload + " is up to date")
                        continue

                print('downloading as ' + download)
                # Write to a temporary name first so that an interrupted
                # run never leaves a truncated file under the final name.
                tmpfile = download + ".tmp"
                with open(tmpfile, 'wb') as f:
                    f.write(base64.b64decode(node.firstChild.nodeValue))
                os.rename(tmpfile, download)
                break
def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
    """Download attachments of one Novell bug that match *mimetype*.

    The bug XML does not carry attachment bodies here, so the comments
    (``<thetext>`` elements) are scanned for "Created an attachment (id=N)"
    and each referenced attachment is fetched from ``novellattach + N``.
    Matching attachments are written to
    ``<suffix>/<prefix><bugid>-<n>.<suffix>``; existing files are assumed to
    be up to date.

    url      -- show_bug URL ending in ``=<bugid>``
    mimetype -- mimetype to download (compared exactly)
    prefix   -- bug tracker prefix used in the file name
    suffix   -- file extension, also the name of the target directory
    """
    # Take whatever follows the LAST '=' so that URLs carrying additional
    # query parameters still yield the bug id.
    bugid = url.rsplit('=', 1)[-1]
    print("id is " + prefix + bugid + " " + suffix)
    print("parsing " + bugid)
    sock = urlopen_retry(url + "&ctype=xml")
    try:
        dom = minidom.parse(sock)
    finally:
        # Close the connection even when the XML cannot be parsed.
        sock.close()

    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        # An empty comment has no text node at all.
        if comment.firstChild is None:
            continue
        comment_text = comment.firstChild.nodeValue
        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", comment_text)
        if not match:
            continue

        attachmentid += 1

        download = suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix
        if os.path.isfile(download):
            print("assuming " + download + " is up to date")
            continue

        real_attachment_id = match.group(1)
        handle = urlopen_retry(novellattach + real_attachment_id)
        if not handle:
            print("attachment %s is not accessible" % real_attachment_id)
            continue
        try:
            print(" mimetype is", end=' ')

            info = handle.info()
            # Python 3 message objects offer get_content_type(); the
            # Python 2 ones only offer gettype().  Probe with hasattr so
            # the fallback is actually reachable.
            if hasattr(info, 'get_content_type'):
                remote_mime = info.get_content_type()
            else:
                remote_mime = info.gettype()
            print(remote_mime, end=' ')
            if remote_mime != mimetype:
                print("skipping")
                continue

            print('downloading as ' + download)
            # Write to a temporary name first so that an interrupted run
            # never leaves a truncated file under the final name.
            tmpfile = download + ".tmp"
            with open(tmpfile, 'wb') as f:
                f.write(handle.read())
            os.rename(tmpfile, download)
        finally:
            # Release the HTTP connection whether or not we downloaded.
            handle.close()
def create_query(mimetype):
    """Return the advanced-search parameters selecting bugs that have an
    attachment whose mimetype equals *mimetype*."""
    return {
        'query_format': 'advanced',
        'field0-0-0': 'attachments.mimetype',
        'type0-0-0': 'equals',
        'value0-0-0': mimetype,
    }
def get_downloaded_files(prefix, suffix):
    """List the files in directory *suffix* (relative to the cwd) whose name
    starts with *prefix* and ends in ``.<suffix>``."""
    name_pattern = '%s*.%s' % (prefix, suffix)
    full_pattern = os.path.join(suffix, name_pattern)
    return glob.glob(full_pattern)
def get_file_bz_ids(files, prefix):
    """Return the set of bug ids (as strings) encoded in *files*.

    For each path the part of the base name before the first '-' is taken
    and the first occurrence of *prefix* is removed from it.
    """
    bug_ids = set()
    for path in files:
        leading = os.path.basename(path).split('-')[0]
        bug_ids.add(leading.replace(prefix, '', 1))
    return bug_ids
def get_changed_date(files):
    """Return the date of the most recently modified file in *files*,
    moved back by one day."""
    seconds_per_day = 24 * 60 * 60
    latest_mtime = max(os.stat(path)[stat.ST_MTIME] for path in files)
    # Going back a day sidesteps timezone differences; at worst a few more
    # bugs than strictly necessary get processed.
    return datetime.date.fromtimestamp(latest_mtime - seconds_per_day)
def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
    """Download all *mimetype* attachments from a Bugzilla via XML-RPC search.

    First, if files for this tracker/type already exist, only bugs changed
    since the newest local file (minus a day) are processed.  Then the full
    list of matching bugs is queried; it is only walked when it contains a
    bug for which no local file exists yet.

    rpcurl   -- XML-RPC endpoint of the Bugzilla
    showurl  -- show_bug URL prefix; the bug id is appended to it
    mimetype -- attachment mimetype to search for
    prefix   -- bug tracker prefix used in file names
    suffix   -- file extension, also the name of the target directory
    """
    try:
        os.mkdir(suffix)
    except OSError:
        # Usually the directory already exists; any real problem surfaces
        # as soon as a file is written into it.
        pass

    def process(query, full, have=()):
        """Run *query* and download from every bug returned.

        When *full* is true and every returned bug id is already in *have*,
        nothing is downloaded.
        """
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set(str(bug['id']) for bug in bugs)
                # we already have files from all available bugs
                if available.issubset(have):
                    print("assuming all downloaded files are up to date")
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            print("A fault occurred")
            print("Fault code: %s" % err.faultCode)
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
def get_through_rss_query(queryurl, mimetype, prefix, suffix):
    """Download all *mimetype* attachments from a Bugzilla via its RSS buglist.

    First, if files for this tracker/type already exist, only bugs changed
    since the newest local file (minus a day) are processed.  Then the full
    list of matching bugs is queried; it is only walked when it contains a
    bug for which no local file exists yet.  A failure on one bug is
    reported and does not stop the remaining bugs.

    queryurl -- buglist.cgi URL of the Bugzilla
    mimetype -- attachment mimetype to search for
    prefix   -- bug tracker prefix used in file names
    suffix   -- file extension, also the name of the target directory
    """
    try:
        os.mkdir(suffix)
    except OSError:
        # Usually the directory already exists; any real problem surfaces
        # as soon as a file is written into it.
        pass

    # Getting detailed bug information and downloading an attachment body is
    # not possible without logging in to Novell bugzilla;
    # get_novell_bug_via_xml is a workaround for that situation.
    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml

    def process(query, full, have=()):
        """Run *query* and download from every bug in the resulting feed.

        When *full* is true and every returned bug id is already in *have*,
        nothing is downloaded.
        """
        url = queryurl + '?' + '&'.join('='.join(kv) for kv in query.items())
        print('url is ' + url)
        d = feedparser.parse(url)
        entries = d['entries']
        print(str(len(entries)) + ' bugs to process')

        if full:
            available = set(str(entry['id'].split('=')[-1]) for entry in entries)
            # we already have files from all available bugs
            if available.issubset(have):
                print("assuming all downloaded files are up to date")
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except Exception:
                # Exception (rather than a bare except) lets Ctrl+C and
                # interpreter shutdown propagate.
                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype.replace("+", "%2B")))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
# The Launchpad API offers no search for bugs by attachment mimetype, so
# every bug of the most interesting source packages is walked instead.
launchpad_pkgs = tuple("""
    abiword
    calibre
    calligra
    gnumeric
    inkscape
    koffice
    libabw
    libcdr
    libe-book
    libetonyek
    libfreehand
    libmspub
    libmwaw
    liborcus
    libpagemaker
    libreoffice
    libvisio
    libwpd
    libwpg
    libwps
    openoffice.org
    python-uniconvertor
    scribus
    sk1
    unoconv
""".split())
def get_launchpad_bugs(prefix):
    """Download interesting attachments from Ubuntu bugs on Launchpad.

    Every bug task of each source package in ``launchpad_pkgs`` is visited.
    Attachments whose content type is a key of ``mimetypes`` are written to
    ``<suffix>/<prefix><bugid>-<n>.<suffix>``.  As soon as one attachment of
    a bug is found on disk the whole bug is assumed to be up to date and its
    remaining attachments are skipped.

    prefix -- bug tracker prefix used in file names

    Raises ImportError when the launchpadlib module is not installed.
    """
    # launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
    ubuntu = launchpad.distributions["ubuntu"]

    # Ask for every status explicitly so that closed bugs are included too.
    all_statuses = [
        "New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed",
        "Triaged", "In Progress", "Incomplete",
        "Incomplete (with response)", "Incomplete (without response)",
        "Fix Released", "Opinion", "Expired",
    ]

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=all_statuses)

        for bugtask in pkgbugs:
            bug = bugtask.bug
            bugid = str(bug.id)
            print("parsing " + bugid + " status: " + bugtask.status + " title: " + bug.title[:50])
            attachmentid = 0
            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except OSError:
                        # Best effort; a real problem surfaces when the
                        # file is opened for writing below.
                        pass

                download = suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix

                if os.path.isfile(download):
                    print("assuming " + bugid + " is up to date")
                    break

                print('mimetype is ' + handle.content_type + ' downloading as ' + download)

                # Write to a temporary name first so that an interrupted
                # run never leaves a truncated file under the final name.
                tmpfile = download + ".tmp"
                with open(tmpfile, "wb") as f:
                    f.write(handle.read())
                os.rename(tmpfile, download)
# Bugzilla instances queried through their RSS buglist, as
# (file-name prefix, buglist.cgi URL) pairs.
rss_bugzillas = (
    ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), # added for abiword
    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
    ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
    # It seems something has changed and it is no longer possible to
    # download any files from the Novell bugzilla.
    # NOTE: the entry is left in the list, commented out, just so someone
    # does not add it back immediately.
    # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
    # NOTE: running this script against bz.apache.org apparently causes one's
    # IP to be banned or something; you won't get new files in any case...
    # ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
)
# Red Hat bugzilla: XML-RPC endpoint used for searching, and the show_bug URL
# prefix to which a bug id is appended.
redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in, in order to get details of the
# bugs such as attachment bodies etc.
# As a dirty workaround, we parse comments containing
# "Created an attachment (id=xxxxxx)" and download attachments manually.
# python-bugzilla claims that it supports Novell bugzilla login but it's not
# working right now and the novell bugzilla login system is a nightmare.
# The attachment id is appended to this URL prefix.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
# Maps an attachment mimetype to the file extension used for the download;
# the extension is also the name of the directory the file is stored in.
# Several mimetypes may share one extension.
mimetypes = {
# ODF
    'application/vnd.oasis.opendocument.base': 'odb',
    'application/vnd.oasis.opendocument.database': 'odb',
    'application/vnd.oasis.opendocument.chart': 'odc',
    'application/vnd.oasis.opendocument.chart-template': 'otc',
    'application/vnd.oasis.opendocument.formula': 'odf',
    'application/vnd.oasis.opendocument.formula-template': 'otf',
    'application/vnd.oasis.opendocument.graphics': 'odg',
    'application/vnd.oasis.opendocument.graphics-template': 'otg',
    'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
    'application/vnd.oasis.opendocument.presentation': 'odp',
    'application/vnd.oasis.opendocument.presentation-template': 'otp',
    'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
    'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
    'application/vnd.oasis.opendocument.text-master': 'odm',
    'application/vnd.oasis.opendocument.text-template': 'ott',
    'application/vnd.oasis.opendocument.text-master-template': 'otm',
    'application/vnd.oasis.opendocument.text-web': 'oth',
# OOo XML
    'application/vnd.sun.xml.base': 'odb',
    'application/vnd.sun.xml.calc': 'sxc',
    'application/vnd.sun.xml.calc.template': 'stc',
    'application/vnd.sun.xml.chart': 'sxs',
    'application/vnd.sun.xml.draw': 'sxd',
    'application/vnd.sun.xml.draw.template': 'std',
    'application/vnd.sun.xml.impress': 'sxi',
    'application/vnd.sun.xml.impress.template': 'sti',
    'application/vnd.sun.xml.math': 'sxm',
    'application/vnd.sun.xml.writer': 'sxw',
    'application/vnd.sun.xml.writer.global': 'sxg',
    'application/vnd.sun.xml.writer.template': 'stw',
    'application/vnd.sun.xml.writer.web': 'stw',
# MSO
    'application/rtf': 'rtf',
    'text/rtf': 'rtf',
    'application/msword': 'doc',
    'application/vnd.ms-powerpoint': 'ppt',
    'application/vnd.ms-excel': 'xls',
    'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
    'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
    'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
    'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
    'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
    'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
    'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
    'application/vnd.ms-word.document.macroEnabled.12': 'docm',
    'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
    'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
    'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
    'application/vnd.visio': 'vsd',
    'application/visio.drawing': 'vsd',
    'application/vnd.visio2013': 'vsdx',
    'application/vnd.visio.xml': 'vdx',
    'application/x-mspublisher': 'pub',
# WPS Office
    'application/wps-office.doc': 'doc',
    'application/wps-office.docx': 'docx',
    'application/wps-office.xls': 'xls',
    'application/wps-office.xlsx': 'xlsx',
    'application/wps-office.ppt': 'ppt',
    'application/wps-office.pptx': 'pptx',
# W3C
    'application/xhtml+xml': 'xhtml',
    'application/mathml+xml': 'mml',
    'text/html': 'html',
    'application/docbook+xml': 'docbook',
# misc
    'text/csv': 'csv',
    'text/spreadsheet': 'slk',
    'application/x-qpro': 'qpro',
    'application/x-dbase': 'dbf',
    'application/vnd.corel-draw': 'cdr',
    'application/vnd.lotus-wordpro': 'lwp',
    'application/vnd.lotus-1-2-3': 'wks',
    'application/vnd.wordperfect': 'wpd',
    'application/wordperfect5.1': 'wpd',
    'application/vnd.ms-works': 'wps',
    'application/clarisworks' : 'cwk',
    'application/macwriteii' : 'mw',
    'application/vnd.apple.keynote': 'key',
    'application/vnd.apple.numbers': 'numbers',
    'application/vnd.apple.pages': 'pages',
    'application/x-iwork-keynote-sffkey': 'key',
    'application/x-iwork-numbers-sffnumbers': 'numbers',
    'application/x-iwork-pages-sffpages': 'pages',
    'application/x-hwp': 'hwp',
    'application/x-aportisdoc': 'pdb',
    'application/prs.plucker' : 'pdb_plucker',
    'application/vnd.palm' : 'pdb_palm',
    'application/x-sony-bbeb' : 'lrf',
    'application/x-pocket-word': 'psw',
    'application/x-t602': '602',
    'application/x-fictionbook+xml': 'fb2',
    'application/x-abiword': 'abw',
    'application/x-pagemaker': 'pmd',
    'application/x-gnumeric': 'gnumeric',
    'application/vnd.stardivision.calc': 'sdc',
    'application/vnd.stardivision.draw': 'sda',
    'application/vnd.stardivision.writer': 'sdw',
    'application/x-starcalc': 'sdc',
    'application/x-stardraw': 'sdd',
    'application/x-starwriter': 'sdw',
# relatively uncommon image mimetypes
    'image/x-freehand': 'fh',
    'image/cgm': 'cgm',
    'image/tif': 'tiff',
    'image/tiff': 'tiff',
    'image/vnd.dxf': 'dxf',
    'image/emf': 'emf',
    'image/x-emf': 'emf',
    'image/x-targa': 'tga',
    'image/x-sgf': 'sgf',
    'image/x-svm': 'svm',
    'image/wmf': 'wmf',
    'image/x-wmf': 'wmf',
    'image/x-pict': 'pict',
    'image/x-cmx': 'cmx',
    'image/svg+xml': 'svg',
    'image/bmp': 'bmp',
    'image/x-ms-bmp': 'bmp',
    'image/x-MS-bmp': 'bmp',
    'image/x-wpg': 'wpg',
    'image/x-eps': 'eps',
    'image/x-met': 'met',
    'image/x-portable-bitmap': 'pbm',
    'image/x-photo-cd': 'pcd',
    'image/x-pcx': 'pcx',
    'image/x-portable-graymap': 'pgm',
    'image/x-portable-pixmap': 'ppm',
    'image/vnd.adobe.photoshop': 'psd',
    'image/x-cmu-raster': 'ras',
    'image/x-sun-raster': 'ras',
    'image/x-xbitmap': 'xbm',
    'image/x-xpixmap': 'xpm',
}
# Very common mimetypes, in the same mimetype -> extension form as the
# mimetypes table. Disabled for now (nothing visible here reads this table):
# this would download gigs of pngs/jpegs...
common_noncore_mimetypes = {
# graphics
    'image/gif': 'gif',
    'image/jpeg': 'jpeg',
    'image/png': 'png',
# pdf, etc.
    'application/pdf': 'pdf',
}
class manage_threads(threading.Thread):
    """Worker thread that runs RSS queries taken from the global ``jobs`` queue.

    Each job is a ``(uri, mimetype, prefix, extension)`` sequence that is
    passed to get_through_rss_query().  The thread exits once no job has
    arrived for 6 seconds.
    """

    def run(self):
        while True:
            # Wait for the next job; give up when the queue stays empty.
            try:
                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
            except queue.Empty:
                break

            try:
                get_through_rss_query(uri, mimetype, prefix, extension)
            except Exception as e:
                # A failing job must not kill the worker: if every worker
                # died, the remaining jobs would never be marked done and
                # jobs.join() would block forever.
                print("job for " + mimetype + " in bugtracker " + prefix + " failed: " + str(e))
            finally:
                # Always tell the queue this job is finished, even on error.
                jobs.task_done()
def generate_multi_threading():
    """Process every RSS bugzilla in turn with a pool of worker threads.

    For each tracker a fresh set of ``max_threads`` workers is started, one
    job per mimetype is queued, and the function waits for the queue to
    drain before moving on to the next tracker.
    """
    for tracker, buglist_url in rss_bugzillas:

        # Spin up the workers for this tracker
        for _ in range(max_threads):
            worker = manage_threads()
            worker.start()

        # Queue one job per mimetype for this tracker
        for mimetype, extension in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere), so the
            # text/html query is left out for the moz tracker.
            if tracker != 'moz' or mimetype != 'text/html':
                job = [buglist_url, mimetype, tracker, extension]
                jobs.put(job, block=True)
                print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + tracker)

        # Move on only when all mimetypes are done for this tracker
        jobs.join()
        print("DONE with bugtracker " + tracker)
# --- script entry: everything below runs at module level, in this order ---

max_threads = 20 # Number of threads to create, (1 = without multi-threading)
# Shared job queue read by the manage_threads workers.
jobs = queue.Queue()

# 1. All RSS-capable bugzillas, processed by the worker threads.
generate_multi_threading()

# 2. Red Hat bugzilla through XML-RPC, one mimetype at a time.
for (mimetype,extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

# 3. Launchpad (Ubuntu); skipped when launchpadlib is not installed.
try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")
584 # vim:set shiftwidth=4 softtabstop=4 expandtab: