Branch libreoffice-5-0-4
[LibreOffice.git] / bin / get-bugzilla-attachments-by-mimetype
blobea92bb82258d80bd6e7c0beb4e9e2a59258b96a6
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # This file is part of the LibreOffice project.
6 # This Source Code Form is subject to the terms of the Mozilla Public
7 # License, v. 2.0. If a copy of the MPL was not distributed with this
8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
# This digs through a pile of Bugzilla instances and populates the cwd with a big
12 # collection of bug-docs in per-filetype dirs with bug-ids as names with
13 # prefixes to indicate which bug-tracker, e.g.
15 # fdo-bugid-X.suffix
16 # rhbz-bugid-X.suffix
17 # moz-bugid-X.suffix
19 # where X is the n'th attachment of that type in the bug
21 # The results are stored in the current directory, categorized by the
22 # extension of the downloaded file. When a file already exists, it is assumed
23 # it is already downloaded by a previous run, and up-to-date.
25 from __future__ import print_function
26 import feedparser
27 import base64
28 import datetime
29 import glob
30 import re
31 import os, os.path
32 import stat
33 import sys
34 import threading, Queue
35 try:
36 from urllib.request import urlopen
37 except:
38 from urllib import urlopen
39 try:
40 import xmlrpc.client as xmlrpclib
41 except:
42 import xmlrpclib
43 from xml.dom import minidom
44 from xml.sax.saxutils import escape
def urlopen_retry(url):
    """Open *url*, retrying when the request fails with IOError.

    Returns the object produced by ``urlopen`` on the first successful
    attempt.  One initial attempt plus up to three retries are made; if the
    final attempt also fails, its IOError is re-raised to the caller.
    """
    retries_allowed = 3
    attempt = 0
    while True:
        try:
            return urlopen(url)
        except IOError as err:
            print("caught IOError: " + str(err))
            # Out of retries: let the caller see the last error.
            if attempt == retries_allowed:
                raise
            print("retrying...")
        attempt += 1
def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
    """Download the attachments of one bug whose type equals *mimetype*.

    The bug is fetched as XML (``&ctype=xml`` is appended to *url*) and each
    ``<attachment>`` element is inspected.  A matching attachment is
    base64-decoded and written to ``<suffix>/<prefix><bugid>-<n>.<suffix>``,
    where ``n`` is the 1-based position of the attachment within the bug.
    Files that already exist are assumed to be up to date and are left
    untouched.

    url      -- show_bug URL of the bug; the bug id follows the last '='
    mimetype -- mimetype to look for (compared case-insensitively)
    prefix   -- bug tracker prefix used in the file name
    suffix   -- file extension, also the name of the target directory
    """
    # Take whatever follows the *last* '=' so that URLs carrying additional
    # query parameters still yield the bug id.
    bugid = url.rsplit('=', 1)[-1]
    print("id is " + prefix + bugid + " " + suffix)
    print("parsing " + bugid)
    sock = urlopen_retry(url + "&ctype=xml")
    try:
        dom = minidom.parse(sock)
    finally:
        # Release the connection even when the XML cannot be parsed.
        sock.close()
    attachmentid = 0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(" mimetype is", end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # An empty <type/> element has no text child; treat it as
                # an empty mimetype, which never matches.
                found = node.firstChild.nodeValue if node.firstChild else ''
                print(found, end=' ')
                if found.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                download = suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix
                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                print('downloading as ' + download)
                with open(download, 'wb') as f:
                    f.write(base64.b64decode(node.firstChild.nodeValue))
                break
def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
    """Download attachments of a Novell bug by scraping its comments.

    The bug is fetched as XML and every comment (``<thetext>``) is searched
    for the text "Created an attachment (id=N)".  Each attachment found that
    way is requested from ``novellattach`` + N; when the Content-Type of the
    response equals *mimetype* the body is written to
    ``<suffix>/<prefix><bugid>-<n>.<suffix>``, where ``n`` counts the
    attachment-creating comments of the bug.  Existing files are assumed to
    be up to date and are skipped.

    url      -- show_bug URL of the bug; the bug id follows the last '='
    mimetype -- mimetype the attachment must have (exact comparison)
    prefix   -- bug tracker prefix used in the file name
    suffix   -- file extension, also the name of the target directory
    """
    # Take whatever follows the *last* '=' so that URLs carrying additional
    # query parameters still yield the bug id.
    bugid = url.rsplit('=', 1)[-1]
    print("id is " + prefix + bugid + " " + suffix)
    print("parsing " + bugid)
    sock = urlopen_retry(url + "&ctype=xml")
    try:
        dom = minidom.parse(sock)
    finally:
        sock.close()
    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        # An empty comment has no text node to search.
        if comment.firstChild is None:
            continue
        commentText = comment.firstChild.nodeValue
        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
        if not match:
            continue

        attachmentid += 1

        download = suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix
        if os.path.isfile(download):
            print("assuming " + download + " is up to date")
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print("attachment %s is not accessible" % realAttachmentId)
            continue
        try:
            print(" mimetype is", end=' ')

            info = handle.info()
            # Python 3 message objects offer get_content_type(); the
            # Python 2 ones only have gettype().  Probe with hasattr so a
            # missing attribute does not raise.
            if hasattr(info, 'get_content_type'):
                remoteMime = info.get_content_type()
            else:
                remoteMime = info.gettype()
            print(remoteMime, end=' ')
            if remoteMime != mimetype:
                print("skipping")
                continue

            print('downloading as ' + download)
            with open(download, 'wb') as f:
                f.write(handle.read())
        finally:
            handle.close()
def create_query(mimetype):
    """Return a fresh advanced-search query dict matching attachments of *mimetype*."""
    return {
        'query_format': 'advanced',
        'field0-0-0': 'attachments.mimetype',
        'type0-0-0': 'equals',
        'value0-0-0': mimetype,
    }
def get_downloaded_files(prefix, suffix):
    """List files in directory *suffix* named ``<prefix>*.<suffix>``.

    The returned paths are relative to the current directory and include the
    directory component.
    """
    pattern = '%s*.%s' % (prefix, suffix)
    return glob.glob(os.path.join(suffix, pattern))
def get_file_bz_ids(files, prefix):
    """Return the set of bug ids (as strings) encoded in downloaded file names.

    For each path the base name is cut at its first '-' and the first
    occurrence of *prefix* is removed from what remains.
    """
    ids = set()
    for path in files:
        stem = os.path.basename(path).split('-')[0]
        ids.add(stem.replace(prefix, '', 1))
    return ids
def get_changed_date(files):
    """Return the date one day before the newest modification time in *files*.

    Raises ValueError when *files* is empty.
    """
    # Subtract a day to avoid timezone differences. The worst thing that
    # can happen is that we are going to process more bugs than necessary.
    one_day = 24 * 60 * 60
    latest = max(os.stat(path)[stat.ST_MTIME] for path in files)
    return datetime.date.fromtimestamp(latest - one_day)
def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
    """Download all attachments of *mimetype* from a Bugzilla via XML-RPC.

    Bugs are found with ``Bug.search`` on the XML-RPC endpoint *rpcurl*; each
    hit is then downloaded through its show_bug page (*showurl* + bug id).
    When files from an earlier run exist, recently changed bugs are
    processed first.  The full bug list is processed afterwards, unless
    every bug it contains already has a downloaded file.

    rpcurl   -- URL of the tracker's xmlrpc.cgi
    showurl  -- show_bug URL up to and including 'id='
    mimetype -- attachment mimetype to search for
    prefix   -- bug tracker prefix used in file names
    suffix   -- file extension, also the name of the target directory
    """
    try:
        os.mkdir(suffix)
    except OSError:
        # Most likely the directory is left over from a previous run.
        pass

    def process(query, full, have=()):
        """Run *query* and download every bug returned.

        With *full* set, do nothing when all returned bug ids are in *have*.
        """
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set(str(bug['id']) for bug in bugs)
                # we already have files from all available bugs
                if not available.difference(have):
                    print("assuming all downloaded files are up to date")
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                # One broken bug must not abort the whole run; report it and
                # carry on.  KeyboardInterrupt is not an Exception subclass,
                # so Ctrl+C still works.
                try:
                    get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
                except Exception:
                    print(url + " failed: " + str(sys.exc_info()[0]))
        except xmlrpclib.Fault as err:
            print("A fault occurred")
            print("Fault code: %s" % err.faultCode)
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
def get_through_rss_query(queryurl, mimetype, prefix, suffix):
    """Download all attachments of *mimetype* from a Bugzilla via its RSS feed.

    The buglist at *queryurl* is queried with ``ctype=rss`` and every entry
    of the feed is downloaded.  When files from an earlier run exist,
    recently changed bugs are processed first.  The full bug list is
    processed afterwards, unless every bug it contains already has a
    downloaded file.

    queryurl -- URL of the tracker's buglist.cgi
    mimetype -- attachment mimetype to search for
    prefix   -- bug tracker prefix used in file names
    suffix   -- file extension, also the name of the target directory
    """
    # urlencode lives in different modules on Python 2 and Python 3.
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode

    try:
        os.mkdir(suffix)
    except OSError:
        # Most likely the directory is left over from a previous run.
        pass

    #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
    #get_novell_bug_via_xml function is a workaround for that situation
    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml

    def process(query, full, have=()):
        """Fetch the feed for *query* and download every bug it lists.

        With *full* set, do nothing when all listed bug ids are in *have*.
        """
        # Percent-encode the values: a raw '+' (as in 'image/svg+xml') would
        # be decoded as a space by the server and the search would not match.
        url = queryurl + '?' + urlencode(query)
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        if full:
            available = set(str(entry['id'].split('=')[-1]) for entry in d['entries'])
            # we already have files from all available bugs
            if not available.difference(have):
                print("assuming all downloaded files are up to date")
                return

        for entry in d['entries']:
            # KeyboardInterrupt is not an Exception subclass, so Ctrl+C
            # still propagates.
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except Exception:
                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        # Select bugs modified on or after the reference date.  The previous
        # 'changedbefore' comparison selected the old bugs instead.
        # NOTE(review): 'delta_ts' is assumed to be Bugzilla's internal name
        # for the last-modified field -- confirm against the target trackers.
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
# The Launchpad API offers no way to search for bugs by attachment mimetype,
# so all bugs of the most interesting source packages are walked instead.
launchpad_pkgs = (
    "abiword",
    "calibre",
    "calligra",
    "gnumeric",
    "inkscape",
    "koffice",
    "libabw",
    "libcdr",
    "libe-book",
    "libetonyek",
    "libfreehand",
    "libmspub",
    "libmwaw",
    "liborcus",
    "libpagemaker",
    "libreoffice",
    "libvisio",
    "libwpd",
    "libwpg",
    "libwps",
    "openoffice.org",
    "python-uniconvertor",
    "scribus",
    "sk1",
    "unoconv",
)
def get_launchpad_bugs(prefix):
    """Download interesting attachments from Ubuntu bugs on Launchpad.

    Every bug task of every package in ``launchpad_pkgs`` is visited.  An
    attachment whose content type is a key of ``mimetypes`` is written to
    ``<suffix>/<prefix><bugid>-<n>.<suffix>``, where ``n`` is the 1-based
    position of the attachment within the bug.  When a target file already
    exists, the remaining attachments of that bug are skipped.

    prefix -- bug tracker prefix used in file names

    Raises ImportError when the launchpadlib module is not installed.
    """
    #launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
    ubuntu = launchpad.distributions["ubuntu"]

    # Ask for every status so that closed bugs are included as well.
    statuses = ["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"]

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=statuses)

        for bugtask in pkgbugs:
            bug = bugtask.bug
            bugid = str(bug.id)
            print("parsing " + bugid + " status: " + bugtask.status + " title: " + bug.title[:50])
            attachmentid = 0
            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except OSError:
                        # Another run may have created it in the meantime.
                        pass

                download = suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix

                if os.path.isfile(download):
                    print("assuming " + bugid + " is up to date")
                    break

                print('mimetype is ' + handle.content_type + ' downloading as ' + download)

                # Attachments are binary documents: write them in binary
                # mode so the bytes reach the disk unmodified.
                with open(download, "wb") as f:
                    f.write(handle.read())
# Bug trackers queried through their RSS buglist, keyed by the prefix that
# is used in the names of the downloaded files.
rss_bugzillas = {
    'abi': 'http://bugzilla.abisource.com/buglist.cgi',  # added for abiword
    'fdo': 'http://bugs.libreoffice.org/buglist.cgi',
    'gentoo': 'http://bugs.gentoo.org/buglist.cgi',
    'gnome': 'http://bugzilla.gnome.org/buglist.cgi',  # added for gnumeric
    'kde': 'http://bugs.kde.org/buglist.cgi',  # added for koffice/calligra
    'mandriva': 'https://qa.mandriva.com/buglist.cgi',
    'moz': 'https://bugzilla.mozilla.org/buglist.cgi',
    # It seems something has changed and it is no longer possible to
    # download any files from there.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
    'ooo': 'https://bz.apache.org/ooo/buglist.cgi',
    'tdf': 'http://bugs.documentfoundation.org/buglist.cgi',
}
# Red Hat Bugzilla: XML-RPC endpoint and the show_bug URL up to the bug id.
redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in in order to get details of the
# bugs, such as attachment bodies.  As a dirty workaround, comments
# containing "Created an attachment (id=xxxxxx)" are parsed and the
# attachments are downloaded manually.  python-bugzilla claims that it
# supports Novell bugzilla login, but it's not working right now and the
# Novell bugzilla login system is a nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
# Maps each attachment mimetype to download onto the file extension used
# for it.  The extension is also the name of the directory the files are
# stored in.
mimetypes = {
    # ODF
    'application/vnd.oasis.opendocument.base': 'odb',
    'application/vnd.oasis.opendocument.database': 'odb',
    'application/vnd.oasis.opendocument.chart': 'odc',
    'application/vnd.oasis.opendocument.chart-template': 'otc',
    'application/vnd.oasis.opendocument.formula': 'odf',
    'application/vnd.oasis.opendocument.formula-template': 'otf',
    'application/vnd.oasis.opendocument.graphics': 'odg',
    'application/vnd.oasis.opendocument.graphics-template': 'otg',
    'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
    'application/vnd.oasis.opendocument.presentation': 'odp',
    'application/vnd.oasis.opendocument.presentation-template': 'otp',
    'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
    'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
    'application/vnd.oasis.opendocument.text-master': 'odm',
    'application/vnd.oasis.opendocument.text-template': 'ott',
    'application/vnd.oasis.opendocument.text-master-template': 'otm',
    'application/vnd.oasis.opendocument.text-web': 'oth',
    # OOo XML
    'application/vnd.sun.xml.base': 'odb',
    'application/vnd.sun.xml.calc': 'sxc',
    'application/vnd.sun.xml.calc.template': 'stc',
    'application/vnd.sun.xml.chart': 'sxs',
    'application/vnd.sun.xml.draw': 'sxd',
    'application/vnd.sun.xml.draw.template': 'std',
    'application/vnd.sun.xml.impress': 'sxi',
    'application/vnd.sun.xml.impress.template': 'sti',
    'application/vnd.sun.xml.math': 'sxm',
    'application/vnd.sun.xml.writer': 'sxw',
    'application/vnd.sun.xml.writer.global': 'sxg',
    'application/vnd.sun.xml.writer.template': 'stw',
    'application/vnd.sun.xml.writer.web': 'stw',
    # MSO
    'application/rtf': 'rtf',
    'text/rtf': 'rtf',
    'application/msword': 'doc',
    'application/vnd.ms-powerpoint': 'ppt',
    'application/vnd.ms-excel': 'xls',
    'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
    'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
    'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
    'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
    'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
    'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
    'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
    'application/vnd.ms-word.document.macroEnabled.12': 'docm',
    'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    # PowerPoint templates use the extension 'potx' (this was misspelled
    # 'ppotx'; files from earlier runs may still sit in a 'ppotx' directory).
    'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
    'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
    'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
    'application/vnd.visio': 'vsd',
    'application/vnd.visio.xml': 'vdx',
    'application/x-mspublisher': 'pub',
    # W3C
    'application/xhtml+xml': 'xhtml',
    'application/mathml+xml': 'mml',
    'text/html': 'html',
    'application/docbook+xml': 'docbook',
    # misc
    'text/csv': 'csv',
    'text/spreadsheet': 'slk',
    'application/x-dbase': 'dbf',
    'application/vnd.corel-draw': 'cdr',
    'application/vnd.lotus-wordpro': 'lwp',
    'application/vnd.lotus-1-2-3': 'wks',
    'application/vnd.wordperfect': 'wpd',
    'application/wordperfect5.1': 'wpd',
    'application/vnd.ms-works': 'wps',
    'application/clarisworks': 'cwk',
    'application/macwriteii': 'mw',
    'application/vnd.apple.keynote': 'key',
    'application/vnd.apple.numbers': 'numbers',
    'application/vnd.apple.pages': 'pages',
    'application/x-iwork-keynote-sffkey': 'key',
    'application/x-iwork-numbers-sffnumbers': 'numbers',
    'application/x-iwork-pages-sffpages': 'pages',
    'application/x-hwp': 'hwp',
    'application/x-aportisdoc': 'pdb',
    'application/prs.plucker': 'pdb_plucker',
    'application/vnd.palm': 'pdb_palm',
    'application/x-sony-bbeb': 'lrf',
    'application/x-pocket-word': 'psw',
    'application/x-t602': '602',
    'application/x-fictionbook+xml': 'fb2',
    'application/x-abiword': 'abw',
    'application/x-pagemaker': 'pmd',
    # relatively uncommon image mimetypes
    'image/x-freehand': 'fh',
    'image/cgm': 'cgm',
    'image/tiff': 'tiff',
    'image/vnd.dxf': 'dxf',
    'image/x-emf': 'emf',
    'image/x-targa': 'tga',
    'image/x-sgf': 'sgf',
    'image/x-svm': 'svm',
    'image/x-wmf': 'wmf',
    'image/x-pict': 'pict',
    'image/x-cmx': 'cmx',
    'image/svg+xml': 'svg',
    'image/x-MS-bmp': 'bmp',
    'image/x-wpg': 'wpg',
    'image/x-eps': 'eps',
    'image/x-met': 'met',
    'image/x-portable-bitmap': 'pbm',
    'image/x-photo-cd': 'pcd',
    'image/x-pcx': 'pcx',
    'image/x-portable-graymap': 'pgm',
    'image/x-portable-pixmap': 'ppm',
    'image/vnd.adobe.photoshop': 'psd',
    'image/x-cmu-raster': 'ras',
    'image/x-sun-raster': 'ras',
    'image/x-xbitmap': 'xbm',
    'image/x-xpixmap': 'xpm',
}
# Very common mimetypes of little interest here.  Disabled for now: this
# would download gigs of pngs/jpegs...
common_noncore_mimetypes = {
    # graphics
    'image/gif': 'gif',
    'image/jpeg': 'jpeg',
    'image/png': 'png',
    # pdf, etc.
    'application/pdf': 'pdf',
}
class manage_threads(threading.Thread):
    """Worker thread that runs RSS download jobs taken from the ``jobs`` queue."""

    def run(self):
        """Process jobs until the queue has stayed empty for five seconds.

        Each job is a sequence ``[uri, mimetype, prefix, extension]`` that is
        passed on to ``get_through_rss_query``.  A failing job is reported
        and the worker moves on to the next one.
        """
        while True:
            try:
                job = jobs.get(True, 5)
            except Queue.Empty:
                # Nothing arrived within the timeout: this worker is done.
                break
            try:
                get_through_rss_query(job[0], job[1], job[2], job[3]) # [0] = uri; [1] = mimetype; [2] = prefix; [3] = extension
            except Exception:
                print("job " + str(job) + " failed: " + str(sys.exc_info()[1]))
            finally:
                # Always mark the job as done, even after a failure;
                # otherwise jobs.join() in the producer never returns.
                jobs.task_done()
def generate_multi_threading():
    """Query every bug tracker in ``rss_bugzillas`` for every mimetype.

    For each tracker a pool of ``max_threads`` worker threads is started and
    one job per entry of ``mimetypes`` is placed in the ``jobs`` queue.  The
    function waits for all jobs of a tracker to finish before moving on to
    the next tracker.
    """
    for (prefix, uri) in rss_bugzillas.items():

        # Initialize threads
        workers = [manage_threads() for _ in range(max_threads)]
        for worker in workers:
            worker.start()

        # Create a job for every mimetype for a bugzilla
        for (mimetype, extension) in mimetypes.items():

            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere) so we always
            # end processing the complete list.
            if mimetype == 'text/html' and prefix == 'moz':
                continue

            while True:
                try:
                    jobs.put([uri, mimetype, prefix, extension], block=True, timeout=3)
                except Queue.Full:
                    # A full queue only means the workers are busy.  Keep
                    # waiting while any of them is still running, so that
                    # no mimetype is silently dropped.
                    if any(worker.is_alive() for worker in workers):
                        continue
                    print("Queue full")
                    break
                else:
                    print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)
                    break

        # Continue when all mimetypes are done for a bugzilla
        jobs.join()
# Settings shared by the worker threads and the job producer.
max_threads = 20  # Number of threads to create, (1 = without multi-threading)
jobs = Queue.Queue(40)  # holds at most 40 pending jobs

# First all the trackers that are reachable through RSS buglists ...
generate_multi_threading()

# ... then Red Hat Bugzilla through XML-RPC, one mimetype at a time ...
for (mimetype, extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

# ... and finally Launchpad, which needs the optional launchpadlib module.
try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")

# vim:set shiftwidth=4 softtabstop=4 expandtab: