Version 6.1.4.1, tag libreoffice-6.1.4.1
[LibreOffice.git] / bin / get-bugzilla-attachments-by-mimetype
blob5d6227996cb8802ad6acba8418d2856ac7ecaf6e
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # This file is part of the LibreOffice project.
6 # This Source Code Form is subject to the terms of the Mozilla Public
7 # License, v. 2.0. If a copy of the MPL was not distributed with this
8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 # This digs through a pile of bugzilla's and populates the cwd with a big
12 # collection of bug-docs in per-filetype dirs with bug-ids as names with
13 # prefixes to indicate which bug-tracker, e.g.
15 # fdo-bugid-X.suffix
16 # rhbz-bugid-X.suffix
17 # moz-bugid-X.suffix
19 # where X is the n'th attachment of that type in the bug
21 # The results are stored in the current directory, categorized by the
22 # extension of the downloaded file. When a file already exists, it is assumed
23 # it is already downloaded by a previous run, and up-to-date.
25 from __future__ import print_function
26 import feedparser
27 import base64
28 import datetime
29 import glob
30 import re
31 import os, os.path
32 import stat
33 import sys
34 import threading
35 try:
36 import queue
37 except:
38 import Queue as queue
39 try:
40 from urllib.request import urlopen
41 except:
42 from urllib import urlopen
43 try:
44 import xmlrpc.client as xmlrpclib
45 except:
46 import xmlrpclib
47 from xml.dom import minidom
48 from xml.sax.saxutils import escape
def urlopen_retry(url):
    """Open *url*, retrying up to three more times on IOError.

    Returns the file-like object from urlopen(); once all attempts are
    exhausted the last IOError is re-raised.
    """
    attempts = 4  # one initial try plus three retries
    for attempt in range(attempts):
        try:
            return urlopen(url)
        except IOError as e:
            print("caught IOError: " + str(e))
            if attempt == attempts - 1:
                raise  # out of retries, propagate the failure
            print("retrying...")
def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
    """Download every attachment of type *mimetype* from one bugzilla bug.

    url: show_bug.cgi URL whose last "="-separated token is the bug id;
         "&ctype=xml" is appended to fetch the XML representation.
    mimetype: attachment content type to keep (compared case-insensitively).
    prefix: bugtracker tag used in the output file name (e.g. "fdo", "tdf").
    suffix: file extension, which is also the output directory name.

    Files are written as <suffix>/<prefix><bugid>-<n>.<suffix> where n is
    the attachment's 1-based position within the bug.  Files that already
    exist are assumed up to date and left alone.
    """
    id = url.rsplit('=', 2)[1]  # bug id is the last '='-separated token
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url+"&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid=0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(" mimetype is", end=' ')
        # Each <attachment> element carries a <type> child (the mimetype)
        # and a <data> child (base64-encoded body); react to both below.
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break  # wrong mimetype: move on to the next attachment
            elif node.nodeName == 'data':
                # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                # prevent re-downloading FDO attachments from TDF
                # (presumably ids below 88776 predate the fdo -> tdf rename
                #  of the tracker, so the file may exist under "fdo" --
                #  NOTE(review): threshold taken on faith, confirm)
                if prefix == "tdf" and int(id) < 88776:
                    fdodownload = download.replace("tdf", "fdo")
                    if os.path.isfile(fdodownload):
                        print("assuming FDO " + fdodownload + " is up to date")
                        continue

                print('downloading as ' + download)
                # Write to a .tmp file and rename afterwards so an
                # interrupted run never leaves a truncated download that a
                # later run would mistake for a complete one.
                tmpfile = download + ".tmp"
                f = open(tmpfile, 'wb')
                f.write(base64.b64decode(node.firstChild.nodeValue))
                f.close()
                os.rename(tmpfile, download)
                break
def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
    """Download attachments of type *mimetype* from a Novell bugzilla bug.

    Novell bugzilla hides attachment bodies behind a login, so instead of
    reading <attachment> elements we scan the bug's comments for the
    standard "Created an attachment (id=NNNNNN)" text and fetch each
    attachment directly through attachment.cgi (global ``novellattach``).

    Files are stored as <suffix>/<prefix><bugid>-<n>.<suffix>; existing
    files are assumed up to date.
    """
    id = url.rsplit('=', 2)[1]  # bug id is the last '='-separated token
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url+"&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid=0
    for comment in dom.getElementsByTagName('thetext'):
        commentText = comment.firstChild.nodeValue
        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
        if not match:
            continue

        attachmentid += 1

        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
        if os.path.isfile(download):
            print("assuming " + download + " is up to date")
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print("attachment %s is not accessible" % realAttachmentId)
            continue
        print(" mimetype is", end=' ')

        info = handle.info()
        # Python 3's email.message.Message provides get_content_type();
        # Python 2's mimetools.Message only has gettype().  The previous
        # "if info.get_content_type:" raised AttributeError on Python 2
        # before the fallback branch could ever run, so probe with
        # hasattr() instead.
        if hasattr(info, 'get_content_type'):
            remoteMime = info.get_content_type()
        else:
            remoteMime = info.gettype()
        print(remoteMime, end=' ')
        if remoteMime != mimetype:
            print("skipping")
            continue

        print('downloading as ' + download)
        # Write to a .tmp file and rename so an interrupted download never
        # leaves a truncated file behind for a later run to trust.
        tmpfile = download + ".tmp"
        f = open(tmpfile, 'wb')
        f.write(handle.read())
        f.close()
        os.rename(tmpfile, download)
def create_query(mimetype):
    """Return the base bugzilla advanced-search query for *mimetype*."""
    return {
        'query_format': 'advanced',
        'field0-0-0': 'attachments.mimetype',
        'type0-0-0': 'equals',
        'value0-0-0': mimetype,
    }
def get_downloaded_files(prefix, suffix):
    """List files already downloaded for tracker *prefix* in directory *suffix*."""
    pattern = '%s*.%s' % (prefix, suffix)
    return glob.glob(os.path.join(suffix, pattern))
def get_file_bz_ids(files, prefix):
    """Return the set of bug ids (as strings) that *files* cover.

    A file name looks like <prefix><bugid>-<n>.<ext>; only the first
    occurrence of *prefix* is stripped from the basename.
    """
    ids = set()
    for path in files:
        name = os.path.basename(path)
        ids.add(name.split('-')[0].replace(prefix, '', 1))
    return ids
def get_changed_date(files):
    """Return the modification time of the newest file as a date, minus a day.

    Subtracting a day papers over timezone differences between us and the
    bugzilla server; the worst case is re-examining a few extra bugs.
    """
    one_day = 24 * 60 * 60
    newest = max(os.stat(f)[stat.ST_MTIME] for f in files)
    return datetime.date.fromtimestamp(newest - one_day)
def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
    """Search a bugzilla over XML-RPC for bugs with *mimetype* attachments
    and download them via get_from_bug_url_via_xml().

    rpcurl: xmlrpc.cgi endpoint used for Bug.search.
    showurl: show_bug.cgi URL prefix; the bug id is appended to it.
    prefix/suffix: tracker tag and file extension, as used elsewhere.
    """
    # Best-effort mkdir: the output directory may exist from a prior run.
    try:
        os.mkdir(suffix)
    except:
        pass

    def process(query, full, have=[]):
        # Run one Bug.search query.  When *full* is set and every returned
        # bug id already appears in *have*, skip the downloads entirely.
        # NOTE(review): have=[] is a mutable default, but it is never
        # mutated here, so it is harmless.
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set([str(bug['id']) for bug in bugs])
                # we already have files from all available bugs
                if available.difference(set(have)) == set():
                    print("assuming all downloaded files are up to date")
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            # Report server-side faults without aborting the whole run.
            print("A fault occurred")
            print("Fault code: %s" % err.faultCode)
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list']='bug_id'

    files = get_downloaded_files(prefix, suffix)

    # First pass: only bugs changed since our newest download (cheap).
    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    # Second pass: the full list; exits early when nothing is new.
    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
def get_through_rss_query(queryurl, mimetype, prefix, suffix):
    """Search a bugzilla via its RSS buglist for bugs with *mimetype*
    attachments and download them bug by bug.

    queryurl: buglist.cgi URL of the tracker.
    prefix/suffix: tracker tag and file extension, as used elsewhere.
    """
    # Best-effort mkdir: the output directory may exist from a prior run.
    try:
        os.mkdir(suffix)
    except:
        pass

    #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
    #get_novell_bug_via_xml function is a workaround for that situation
    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml

    def process(query, full, have=[]):
        # Fetch the query as an RSS feed and hand each entry to the
        # per-bug download function.  NOTE(review): have=[] is a mutable
        # default, but it is never mutated here, so it is harmless.
        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        entries = []
        for entry in d['entries']:
            # NOTE(review): bugid is computed but unused; this loop simply
            # copies d['entries'] into a list.
            bugid = entry['id'].split('=')[-1]
            entries.append(entry)

        if full:
            available = set([str(entry['id'].split('=')[-1]) for entry in entries])
            # we already have files from all available bugs
            if available.difference(set(have)) == set():
                print("assuming all downloaded files are up to date")
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except KeyboardInterrupt:
                raise # Ctrl+C should work
            except:
                # Log and keep going: one broken bug must not stop the run.
                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
                pass

    # "+" must be escaped or bugzilla treats it as a space in the mimetype.
    query = create_query(escape(mimetype.replace("+","%2B")))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    # First pass: only bugs changed since our newest download (cheap).
    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'changed'
        query_changed['type0-1-0'] = 'changedbefore'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    # Second pass: the full list; exits early when nothing is new.
    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
266 #since searching bugs having attachments with specific mimetypes is not available in launchpad API
267 #we're iterating over all bugs of the most interesting source packages
268 launchpad_pkgs = (
269 "abiword",
270 "calibre",
271 "calligra",
272 "gnumeric",
273 "inkscape",
274 "koffice",
275 "libabw",
276 "libcdr",
277 "libe-book",
278 "libetonyek",
279 "libfreehand",
280 "libmspub",
281 "libmwaw",
282 "liborcus",
283 "libpagemaker",
284 "libreoffice",
285 "libvisio",
286 "libwpd",
287 "libwpg",
288 "libwps",
289 "openoffice.org",
290 "python-uniconvertor",
291 "scribus",
292 "sk1",
293 "unoconv",
def get_launchpad_bugs(prefix):
    """Download interesting attachments from Ubuntu bugs on Launchpad.

    Launchpad's API cannot search by attachment mimetype, so we walk every
    bug of each source package in the global launchpad_pkgs tuple and
    filter attachments against the global mimetypes map.  *prefix* tags
    the output file names.

    Raises ImportError when launchpadlib is missing (caught by the caller).
    """
    #launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
    ubuntu = launchpad.distributions["ubuntu"]

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])

        for bugtask in pkgbugs:
            bug = bugtask.bug
            id = str(bug.id)
            print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
            attachmentid = 0
            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                # Only keep content types we have a directory mapping for.
                if not handle.content_type in mimetypes:
                    #print "skipping"
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    # best-effort mkdir; a concurrent run may have made it
                    try:
                        os.mkdir(suffix)
                    except:
                        pass

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix

                if os.path.isfile(download):
                    # NOTE(review): break (not continue) abandons the rest
                    # of this bug's attachments at the first existing file;
                    # looks deliberate (earlier runs fetched the rest too)
                    # but worth confirming.
                    print("assuming " + id + " is up to date")
                    break

                print('mimetype is ' + handle.content_type + ' downloading as ' + download)

                # Write to a .tmp file and rename so an interrupted run
                # never leaves a truncated download behind.
                tmpfile = download + ".tmp"
                f = open(tmpfile, "wb")
                f.write(handle.read())
                f.close()
                os.rename(tmpfile, download)
340 rss_bugzillas = (
341 ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
342 ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
343 ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
344 ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
345 ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
346 ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
347 ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
348 # It seems something has changed and it is no longer possible to
349 # download any files from there.
350 # NOTE: I am leaving it in the list, commented out, just so someone
351 # does not add it back immediately .-)
352 # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
353 ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
354 ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
# Red Hat bugzilla is queried over XML-RPC; individual bugs are then
# fetched through the regular show_bug.cgi URL below.
redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

#Novell Bugzilla requires users to log in, in order to get details of the bugs such as attachment bodies etc.
#As a dirty workaround, we parse comments containing "Created an attachment (id=xxxxxx)" and download attachments manually
#python-bugzilla claims that it supports Novell bugzilla login but it's not working right now and novell bugzilla login
#system is a nightmare
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
366 mimetypes = {
367 # ODF
368 'application/vnd.oasis.opendocument.base': 'odb',
369 'application/vnd.oasis.opendocument.database': 'odb',
370 'application/vnd.oasis.opendocument.chart': 'odc',
371 'application/vnd.oasis.opendocument.chart-template': 'otc',
372 'application/vnd.oasis.opendocument.formula': 'odf',
373 'application/vnd.oasis.opendocument.formula-template': 'otf',
374 'application/vnd.oasis.opendocument.graphics': 'odg',
375 'application/vnd.oasis.opendocument.graphics-template': 'otg',
376 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
377 'application/vnd.oasis.opendocument.presentation': 'odp',
378 'application/vnd.oasis.opendocument.presentation-template': 'otp',
379 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
380 'application/vnd.oasis.opendocument.spreadsheet': 'ods',
381 'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
382 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
383 'application/vnd.oasis.opendocument.text': 'odt',
384 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
385 'application/vnd.oasis.opendocument.text-master': 'odm',
386 'application/vnd.oasis.opendocument.text-template': 'ott',
387 'application/vnd.oasis.opendocument.text-master-template': 'otm',
388 'application/vnd.oasis.opendocument.text-web': 'oth',
389 # OOo XML
390 'application/vnd.sun.xml.base': 'odb',
391 'application/vnd.sun.xml.calc': 'sxc',
392 'application/vnd.sun.xml.calc.template': 'stc',
393 'application/vnd.sun.xml.chart': 'sxs',
394 'application/vnd.sun.xml.draw': 'sxd',
395 'application/vnd.sun.xml.draw.template': 'std',
396 'application/vnd.sun.xml.impress': 'sxi',
397 'application/vnd.sun.xml.impress.template': 'sti',
398 'application/vnd.sun.xml.math': 'sxm',
399 'application/vnd.sun.xml.writer': 'sxw',
400 'application/vnd.sun.xml.writer.global': 'sxg',
401 'application/vnd.sun.xml.writer.template': 'stw',
402 'application/vnd.sun.xml.writer.web': 'stw',
403 # MSO
404 'application/rtf': 'rtf',
405 'text/rtf': 'rtf',
406 'application/msword': 'doc',
407 'application/vnd.ms-powerpoint': 'ppt',
408 'application/vnd.ms-excel': 'xls',
409 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
410 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
411 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
412 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
413 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
414 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
415 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
416 'application/vnd.ms-word.document.macroEnabled.12': 'docm',
417 'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
418 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
419 'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
420 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
421 'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
422 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
423 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
424 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
425 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
426 'application/vnd.visio': 'vsd',
427 'application/visio.drawing': 'vsd',
428 'application/vnd.visio2013': 'vsdx',
429 'application/vnd.visio.xml': 'vdx',
430 'application/x-mspublisher': 'pub',
431 #WPS Office
432 'application/wps-office.doc': 'doc',
433 'application/wps-office.docx': 'docx',
434 'application/wps-office.xls': 'xls',
435 'application/wps-office.xlsx': 'xlsx',
436 'application/wps-office.ppt': 'ppt',
437 'application/wps-office.pptx': 'pptx',
438 # W3C
439 'application/xhtml+xml': 'xhtml',
440 'application/mathml+xml': 'mml',
441 'text/html': 'html',
442 'application/docbook+xml': 'docbook',
443 # misc
444 'text/csv': 'csv',
445 'text/spreadsheet': 'slk',
446 'application/x-qpro': 'qpro',
447 'application/x-dbase': 'dbf',
448 'application/vnd.corel-draw': 'cdr',
449 'application/vnd.lotus-wordpro': 'lwp',
450 'application/vnd.lotus-1-2-3': 'wks',
451 'application/vnd.wordperfect': 'wpd',
452 'application/wordperfect5.1': 'wpd',
453 'application/vnd.ms-works': 'wps',
454 'application/clarisworks' : 'cwk',
455 'application/macwriteii' : 'mw',
456 'application/vnd.apple.keynote': 'key',
457 'application/vnd.apple.numbers': 'numbers',
458 'application/vnd.apple.pages': 'pages',
459 'application/x-iwork-keynote-sffkey': 'key',
460 'application/x-iwork-numbers-sffnumbers': 'numbers',
461 'application/x-iwork-pages-sffpages': 'pages',
462 'application/x-hwp': 'hwp',
463 'application/x-aportisdoc': 'pdb',
464 'application/prs.plucker' : 'pdb_plucker',
465 'application/vnd.palm' : 'pdb_palm',
466 'application/x-sony-bbeb' : 'lrf',
467 'application/x-pocket-word': 'psw',
468 'application/x-t602': '602',
469 'application/x-fictionbook+xml': 'fb2',
470 'application/x-abiword': 'abw',
471 'application/x-pagemaker': 'pmd',
472 'application/x-gnumeric': 'gnumeric',
473 'application/vnd.stardivision.calc': 'sdc',
474 'application/vnd.stardivision.draw': 'sda',
475 'application/vnd.stardivision.writer': 'sdw',
476 'application/x-starcalc': 'sdc',
477 'application/x-stardraw': 'sdd',
478 'application/x-starwriter': 'sdw',
479 # relatively uncommon image mimetypes
480 'image/x-freehand': 'fh',
481 'image/cgm': 'cgm',
482 'image/tif': 'tiff',
483 'image/tiff': 'tiff',
484 'image/vnd.dxf': 'dxf',
485 'image/emf': 'emf',
486 'image/x-emf': 'emf',
487 'image/x-targa': 'tga',
488 'image/x-sgf': 'sgf',
489 'image/x-svm': 'svm',
490 'image/wmf': 'wmf',
491 'image/x-wmf': 'wmf',
492 'image/x-pict': 'pict',
493 'image/x-cmx': 'cmx',
494 'image/svg+xml': 'svg',
495 'image/bmp': 'bmp',
496 'image/x-ms-bmp': 'bmp',
497 'image/x-MS-bmp': 'bmp',
498 'image/x-wpg': 'wpg',
499 'image/x-eps': 'eps',
500 'image/x-met': 'met',
501 'image/x-portable-bitmap': 'pbm',
502 'image/x-photo-cd': 'pcd',
503 'image/x-pcx': 'pcx',
504 'image/x-portable-graymap': 'pgm',
505 'image/x-portable-pixmap': 'ppm',
506 'image/vnd.adobe.photoshop': 'psd',
507 'image/x-cmu-raster': 'ras',
508 'image/x-sun-raster': 'ras',
509 'image/x-xbitmap': 'xbm',
510 'image/x-xpixmap': 'xpm',
513 # disabled for now, this would download gigs of pngs/jpegs...
514 common_noncore_mimetypes = {
515 # graphics
516 'image/gif': 'gif',
517 'image/jpeg': 'jpeg',
518 'image/png': 'png',
519 # pdf, etc.
520 'application/pdf': 'pdf',
class manage_threads(threading.Thread):
    """Worker thread that drains the module-level ``jobs`` queue.

    Each job is a (uri, mimetype, prefix, extension) tuple; the worker
    runs the RSS query for it and exits once the queue stays empty for
    the polling timeout.
    """
    def run(self):
        while True:
            try:
                # Wait up to 6 seconds for the next job; a timeout means
                # every job for this tracker has been handed out.
                uri, mimetype, prefix, extension = jobs.get(True, 6)
                try:
                    get_through_rss_query(uri, mimetype, prefix, extension)
                finally:
                    # Always acknowledge the job so jobs.join() can return.
                    jobs.task_done()
            except KeyboardInterrupt:
                raise  # let Ctrl+C terminate the worker
            except queue.Empty:
                break
def generate_multi_threading():
    """Process every RSS bugtracker: spawn a worker pool, enqueue one job
    per known mimetype, and wait for the pool to finish before moving on
    to the next tracker."""
    for prefix, uri in rss_bugzillas:

        # Spin up the worker pool for this tracker.
        for _ in range(max_threads):
            manage_threads().start()

        # Create a job for every mimetype for a bugzilla
        for mimetype, extension in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere) so we always
            # end processing the complete list.
            if prefix == 'moz' and mimetype == 'text/html':
                continue

            jobs.put([uri, mimetype, prefix, extension], block=True)
            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)

        # Block until every queued mimetype for this bugzilla is done.
        jobs.join()
        print("DONE with bugtracker " + prefix)
max_threads = 20 # Number of threads to create, (1 = without multi-threading)
# Shared job queue consumed by the manage_threads workers.
jobs = queue.Queue()

# 1) RSS-queryable bugzillas, processed with a thread pool per tracker.
generate_multi_threading()

# 2) Red Hat bugzilla via XML-RPC, one query per known mimetype.
for (mimetype,extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

# 3) Launchpad, which needs the third-party launchpadlib module.
try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")
577 # vim:set shiftwidth=4 softtabstop=4 expandtab: