bin/get-bugzilla-attachments-by-mimetype

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # This file is part of the LibreOffice project.
   5 #
   6 # This Source Code Form is subject to the terms of the Mozilla Public
   7 # License, v. 2.0. If a copy of the MPL was not distributed with this
   8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
   9 #
  10
# This digs through a pile of Bugzilla instances and populates the cwd with a
# big collection of bug-docs in per-filetype dirs, with bug-ids as names and
# prefixes to indicate which bug-tracker, e.g.
#
# fdo-bugid-X.suffix
# rhbz-bugid-X.suffix
# moz-bugid-X.suffix
#
# where X is the position of the attachment within the bug (attachments of
# every type are counted, so the numbers of one type may have gaps)
  20 #
  21 # The results are stored in the current directory, categorized by the
  22 # extension of the downloaded file.  When a file already exists, it is assumed
  23 # it is already downloaded by a previous run, and up-to-date.
  24
  25 from __future__ import print_function
  26 import feedparser
  27 import base64
  28 import datetime
  29 import glob
  30 import re
  31 import os, os.path
  32 import stat
  33 import sys
  34 import threading, Queue
  35 try:
  36     from urllib.request import urlopen
  37 except:
  38     from urllib import urlopen
  39 try:
  40     import xmlrpc.client as xmlrpclib
  41 except:
  42     import xmlrpclib
  43 from xml.dom import minidom
  44 from xml.sax.saxutils import escape
  45
  46 def urlopen_retry(url):
  47     maxretries = 3
  48     for i in range(maxretries + 1):
  49         try:
  50             return urlopen(url)
  51         except IOError as e:
  52             print("caught IOError: " + str(e))
  53             if maxretries == i:
  54                 raise
  55             print("retrying...")
  56
  57 def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
  58     id = url.rsplit('=', 2)[1]
  59     print("id is " + prefix + id + " " + suffix)
  60     print("parsing " + id)
  61     sock = urlopen_retry(url+"&ctype=xml")
  62     dom = minidom.parse(sock)
  63     sock.close()
  64     attachmentid=0
  65     for attachment in dom.getElementsByTagName('attachment'):
  66         attachmentid += 1
  67         print(" mimetype is", end=' ')
  68         for node in attachment.childNodes:
  69             if node.nodeName == 'type':
  70                 print(node.firstChild.nodeValue, end=' ')
  71                 if node.firstChild.nodeValue.lower() != mimetype.lower():
  72                     print('skipping')
  73                     break
  74             elif node.nodeName == 'data':
  75                 # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
  76                 if not node.firstChild:
  77                     print('deleted attachment, skipping')
  78                     continue
  79
  80                 download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
  81                 if os.path.isfile(download):
  82                     print("assuming " + download + " is up to date")
  83                     continue
  84
  85                 print('downloading as ' + download)
  86                 f = open(download, 'wb')
  87                 f.write(base64.b64decode(node.firstChild.nodeValue))
  88                 f.close()
  89                 break
  90
  91 def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
  92     id = url.rsplit('=', 2)[1]
  93     print("id is " + prefix + id + " " + suffix)
  94     print("parsing " + id)
  95     sock = urlopen_retry(url+"&ctype=xml")
  96     dom = minidom.parse(sock)
  97     sock.close()
  98     attachmentid=0
  99     for comment in dom.getElementsByTagName('thetext'):
 100         commentText = comment.firstChild.nodeValue
 101         match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
 102         if not match:
 103             continue
 104
 105         attachmentid += 1
 106
 107         download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
 108         if os.path.isfile(download):
 109             print("assuming " + download + " is up to date")
 110             continue
 111
 112         realAttachmentId = match.group(1)
 113         handle = urlopen_retry(novellattach + realAttachmentId)
 114         if not handle:
 115             print("attachment %s is not accessible" % realAttachmentId)
 116             continue
 117         print(" mimetype is", end=' ')
 118
 119         info = handle.info()
 120         if info.get_content_type:
 121             remoteMime = info.get_content_type()
 122         else:
 123             remoteMime = info.gettype()
 124         print(remoteMime, end=' ')
 125         if remoteMime != mimetype:
 126             print("skipping")
 127             continue
 128
 129         print('downloading as ' + download)
 130         f = open(download, 'wb')
 131         f.write(handle.read())
 132         f.close()
 133
 134 def create_query(mimetype):
 135     query = dict()
 136     query['query_format']='advanced'
 137     query['field0-0-0']='attachments.mimetype'
 138     query['type0-0-0']='equals'
 139     query['value0-0-0']=mimetype
 140     return query
 141
 142 def get_downloaded_files(prefix, suffix):
 143     return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
 144
 145 def get_file_bz_ids(files, prefix):
 146     return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])
 147
 148 def get_changed_date(files):
 149     newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
 150     # Subtract a day to avoid timezone differences. The worst thing that
 151     # can happen is that we are going to process more bugs than necessary.
 152     return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
 153
 154 def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
 155     try:
 156         os.mkdir(suffix)
 157     except:
 158         pass
 159
 160     def process(query, full, have=[]):
 161         try:
 162             proxy = xmlrpclib.ServerProxy(rpcurl)
 163             result = proxy.Bug.search(query)
 164             bugs = result['bugs']
 165             print(str(len(bugs)) + ' bugs to process')
 166
 167             if full:
 168                 available = set([str(bug['id']) for bug in bugs])
 169                 # we already have files from all available bugs
 170                 if available.difference(set(have)) == set():
 171                     print("assuming all downloaded files are up to date")
 172                     return
 173
 174             for bug in bugs:
 175                 url = showurl + str(bug['id'])
 176                 get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
 177         except xmlrpclib.Fault as err:
 178             print("A fault occurred")
 179             print("Fault code: %s" % err.faultCode)
 180             print(err.faultString)
 181
 182     query = create_query(mimetype)
 183     query['column_list']='bug_id'
 184
 185     files = get_downloaded_files(prefix, suffix)
 186
 187     if files != []:
 188         print('looking for updated bugs having %s attachment(s)' % mimetype)
 189         query_changed = query.copy()
 190         query_changed['field0-1-0'] = 'days_elapsed'
 191         query_changed['type0-1-0'] = 'lessthaneq'
 192         query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
 193         process(query_changed, False)
 194
 195     print('looking for all bugs having %s attachment(s)' % mimetype)
 196     process(query, True, get_file_bz_ids(files, prefix))
 197
 198 def get_through_rss_query(queryurl, mimetype, prefix, suffix):
 199     try:
 200         os.mkdir(suffix)
 201     except:
 202         pass
 203
 204     #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
 205     #get_novell_bug_via_xml function is a workaround for that situation
 206     get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml
 207
 208     def process(query, full, have=[]):
 209         url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.iteritems()])
 210         print('url is ' + url)
 211         d = feedparser.parse(url)
 212         print(str(len(d['entries'])) + ' bugs to process')
 213
 214         if full:
 215             available = set([str(entry['id'].split('=')[-1]) for entry in d['entries']])
 216             # we already have files from all available bugs
 217             if available.difference(set(have)) == set():
 218                 print("assuming all downloaded files are up to date")
 219                 return
 220
 221         for entry in d['entries']:
 222             try:
 223                 get_bug_function(entry['id'], mimetype, prefix, suffix)
 224             except KeyboardInterrupt:
 225                 raise # Ctrl+C should work
 226             except:
 227                 print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
 228                 pass
 229
 230     query = create_query(escape(mimetype))
 231     query['ctype'] = 'rss'
 232
 233     files = get_downloaded_files(prefix, suffix)
 234
 235     if files != []:
 236         print('looking for updated bugs having %s attachment(s)' % mimetype)
 237         query_changed = query.copy()
 238         query_changed['field0-1-0'] = 'changed'
 239         query_changed['type0-1-0'] = 'changedbefore'
 240         query_changed['value0-1-0'] = get_changed_date(files).isoformat()
 241         process(query_changed, False)
 242
 243     print('looking for all bugs having %s attachment(s)' % mimetype)
 244     process(query, True, get_file_bz_ids(files, prefix))
 245
# Searching for bugs having attachments with specific mimetypes is not
# available in the launchpad API, so instead we iterate over all bugs of the
# most interesting source packages.  get_launchpad_bugs() looks each of these
# names up as a source package of the "ubuntu" distribution.
launchpad_pkgs = (
    "abiword",
    "calibre",
    "calligra",
    "gnumeric",
    "inkscape",
    "koffice",
    "libabw",
    "libcdr",
    "libe-book",
    "libetonyek",
    "libfreehand",
    "libmspub",
    "libmwaw",
    "liborcus",
    "libpagemaker",
    "libreoffice",
    "libvisio",
    "libwpd",
    "libwpg",
    "libwps",
    "openoffice.org",
    "python-uniconvertor",
    "scribus",
    "sk1",
    "unoconv",
)
 275
 276 def get_launchpad_bugs(prefix):
 277     #launchpadlib python module is required to download launchpad attachments
 278     from launchpadlib.launchpad import Launchpad
 279
 280     launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
 281     ubuntu = launchpad.distributions["ubuntu"]
 282
 283     for pkg in launchpad_pkgs:
 284         srcpkg = ubuntu.getSourcePackage(name=pkg)
 285         pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])
 286
 287         for bugtask in pkgbugs:
 288             bug = bugtask.bug
 289             id = str(bug.id)
 290             print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
 291             attachmentid = 0
 292             for attachment in bug.attachments:
 293                 attachmentid += 1
 294                 handle = attachment.data.open()
 295                 if not handle.content_type in mimetypes:
 296                     #print "skipping"
 297                     continue
 298
 299                 suffix = mimetypes[handle.content_type]
 300                 if not os.path.isdir(suffix):
 301                     try:
 302                         os.mkdir(suffix)
 303                     except:
 304                         pass
 305
 306                 download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
 307
 308                 if os.path.isfile(download):
 309                     print("assuming " + id + " is up to date")
 310                     break
 311
 312                 print('mimetype is ' + handle.content_type + ' downloading as ' + download)
 313
 314                 f = open(download, "w")
 315                 f.write(handle.read())
 316                 f.close()
 317
# Bug trackers that are queried through their RSS interface: maps the prefix
# used in the downloaded file names to the URL of the tracker's buglist.cgi.
rss_bugzillas = {
    'abi': 'http://bugzilla.abisource.com/buglist.cgi', #added for abiword
    'fdo': 'http://bugs.libreoffice.org/buglist.cgi',
    'gentoo': 'http://bugs.gentoo.org/buglist.cgi',
    'gnome': 'http://bugzilla.gnome.org/buglist.cgi', # added for gnumeric
    'kde': 'http://bugs.kde.org/buglist.cgi', # added for koffice/calligra
    'mandriva': 'https://qa.mandriva.com/buglist.cgi',
    'moz': 'https://bugzilla.mozilla.org/buglist.cgi',
    # It seems something has changed and it is no longer possible to
    # download any files from there.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
    'ooo': 'https://bz.apache.org/ooo/buglist.cgi',
    'tdf': 'http://bugs.documentfoundation.org/buglist.cgi',
}
 334
# Red Hat Bugzilla is queried through XML-RPC instead of RSS: the XML-RPC
# endpoint, and the URL a bug id is appended to in order to address one bug.
redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in in order to get details of the
# bugs, such as attachment bodies.  As a dirty workaround, we parse comments
# containing "Created an attachment (id=xxxxxx)" and download the attachments
# manually.  python-bugzilla claims that it supports Novell bugzilla login,
# but it's not working right now and the novell bugzilla login system is a
# nightmare.
# The attachment id found in a comment is appended to this URL.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
 343
 344 mimetypes = {
 345 # ODF
 346     'application/vnd.oasis.opendocument.base': 'odb',
 347     'application/vnd.oasis.opendocument.database': 'odb',
 348     'application/vnd.oasis.opendocument.chart': 'odc',
 349     'application/vnd.oasis.opendocument.chart-template': 'otc',
 350     'application/vnd.oasis.opendocument.formula': 'odf',
 351     'application/vnd.oasis.opendocument.formula-template': 'otf',
 352     'application/vnd.oasis.opendocument.graphics': 'odg',
 353     'application/vnd.oasis.opendocument.graphics-template': 'otg',
 354     'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
 355     'application/vnd.oasis.opendocument.presentation': 'odp',
 356     'application/vnd.oasis.opendocument.presentation-template': 'otp',
 357     'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
 358     'application/vnd.oasis.opendocument.spreadsheet': 'ods',
 359     'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
 360     'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
 361     'application/vnd.oasis.opendocument.text': 'odt',
 362     'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
 363     'application/vnd.oasis.opendocument.text-master': 'odm',
 364     'application/vnd.oasis.opendocument.text-template': 'ott',
 365     'application/vnd.oasis.opendocument.text-master-template': 'otm',
 366     'application/vnd.oasis.opendocument.text-web': 'oth',
 367 # OOo XML
 368     'application/vnd.sun.xml.base': 'odb',
 369     'application/vnd.sun.xml.calc': 'sxc',
 370     'application/vnd.sun.xml.calc.template': 'stc',
 371     'application/vnd.sun.xml.chart': 'sxs',
 372     'application/vnd.sun.xml.draw': 'sxd',
 373     'application/vnd.sun.xml.draw.template': 'std',
 374     'application/vnd.sun.xml.impress': 'sxi',
 375     'application/vnd.sun.xml.impress.template': 'sti',
 376     'application/vnd.sun.xml.math': 'sxm',
 377     'application/vnd.sun.xml.writer': 'sxw',
 378     'application/vnd.sun.xml.writer.global': 'sxg',
 379     'application/vnd.sun.xml.writer.template': 'stw',
 380     'application/vnd.sun.xml.writer.web': 'stw',
 381 # MSO
 382     'application/rtf': 'rtf',
 383     'text/rtf': 'rtf',
 384     'application/msword': 'doc',
 385     'application/vnd.ms-powerpoint': 'ppt',
 386     'application/vnd.ms-excel': 'xls',
 387     'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
 388     'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
 389     'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
 390     'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
 391     'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
 392     'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
 393     'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
 394     'application/vnd.ms-word.document.macroEnabled.12': 'docm',
 395     'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
 396     'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
 397     'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
 398     'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
 399     'application/vnd.openxmlformats-officedocument.presentationml.template': 'ppotx',
 400     'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
 401     'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
 402     'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
 403     'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
 404     'application/vnd.visio': 'vsd',
 405     'application/vnd.visio.xml': 'vdx',
 406     'application/x-mspublisher': 'pub',
 407 # W3C
 408     'application/xhtml+xml': 'xhtml',
 409     'application/mathml+xml': 'mml',
 410     'text/html': 'html',
 411     'application/docbook+xml': 'docbook',
 412 # misc
 413     'text/csv': 'csv',
 414     'text/spreadsheet': 'slk',
 415     'application/x-dbase': 'dbf',
 416     'application/vnd.corel-draw': 'cdr',
 417     'application/vnd.lotus-wordpro': 'lwp',
 418     'application/vnd.lotus-1-2-3': 'wks',
 419     'application/vnd.wordperfect': 'wpd',
 420     'application/wordperfect5.1': 'wpd',
 421     'application/vnd.ms-works': 'wps',
 422     'application/clarisworks' : 'cwk',
 423     'application/macwriteii' : 'mw',
 424     'application/vnd.apple.keynote': 'key',
 425     'application/vnd.apple.numbers': 'numbers',
 426     'application/vnd.apple.pages': 'pages',
 427     'application/x-iwork-keynote-sffkey': 'key',
 428     'application/x-iwork-numbers-sffnumbers': 'numbers',
 429     'application/x-iwork-pages-sffpages': 'pages',
 430     'application/x-hwp': 'hwp',
 431     'application/x-aportisdoc': 'pdb',
 432     'application/prs.plucker' : 'pdb_plucker',
 433     'application/vnd.palm' : 'pdb_palm',
 434     'application/x-sony-bbeb' : 'lrf',
 435     'application/x-pocket-word': 'psw',
 436     'application/x-t602': '602',
 437     'application/x-fictionbook+xml': 'fb2',
 438     'application/x-abiword': 'abw',
 439     'application/x-pagemaker': 'pmd',
 440 # relatively uncommon image mimetypes
 441     'image/x-freehand': 'fh',
 442     'image/cgm': 'cgm',
 443     'image/tiff': 'tiff',
 444     'image/vnd.dxf': 'dxf',
 445     'image/x-emf': 'emf',
 446     'image/x-targa': 'tga',
 447     'image/x-sgf': 'sgf',
 448     'image/x-svm': 'svm',
 449     'image/x-wmf': 'wmf',
 450     'image/x-pict': 'pict',
 451     'image/x-cmx': 'cmx',
 452     'image/svg+xml': 'svg',
 453     'image/x-MS-bmp': 'bmp',
 454     'image/x-wpg': 'wpg',
 455     'image/x-eps': 'eps',
 456     'image/x-met': 'met',
 457     'image/x-portable-bitmap': 'pbm',
 458     'image/x-photo-cd': 'pcd',
 459     'image/x-pcx': 'pcx',
 460     'image/x-portable-graymap': 'pgm',
 461     'image/x-portable-pixmap': 'ppm',
 462     'image/vnd.adobe.photoshop': 'psd',
 463     'image/x-cmu-raster': 'ras',
 464     'image/x-sun-raster': 'ras',
 465     'image/x-xbitmap': 'xbm',
 466     'image/x-xpixmap': 'xpm',
 467 }
 468
# Mimetype -> extension table for very common, non-core formats.
# disabled for now, this would download gigs of pngs/jpegs...
common_noncore_mimetypes = {
# graphics
    'image/gif': 'gif',
    'image/jpeg': 'jpeg',
    'image/png': 'png',
# pdf, etc.
    'application/pdf': 'pdf',
}
 478
 479 class manage_threads(threading.Thread):
 480     def run(self):
 481         #print(threading.current_thread().get_ident())
 482         while 1:
 483             # Try to receive a job from queue
 484             try:
 485                 # Get job from queue
 486                 # Use job parameters to call our query
 487                 # Then let the queue know we are done with this job
 488                 job = jobs.get(True,5)
 489                 get_through_rss_query(job[0], job[1], job[2], job[3]) # [0] = uri; [1] = mimetype; [2] = prefix; [3] = extension
 490                 jobs.task_done()
 491             except KeyboardInterrupt:
 492                 raise # Ctrl+C should work
 493             except:
 494                 break
 495
 496 def generate_multi_threading():
 497     for (prefix, uri) in rss_bugzillas.items():
 498
 499         # Initialize threads
 500         for i in xrange(max_threads):
 501             manage_threads().start()
 502
 503         # Create a job for every mimetype for a bugzilla
 504         for (mimetype,extension) in mimetypes.items():
 505
 506
 507             # It seems that bugzilla has problems returing that many results
 508             # (10000 results is probably a limit set somewhere) so we always
 509             # end processing the complete list.
 510             if mimetype == 'text/html' and prefix == 'moz':
 511                     continue
 512
 513             try:
 514                 jobs.put([uri, mimetype, prefix, extension], block=True, timeout=3)
 515                 print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)
 516             except KeyboardInterrupt:
 517                 raise # Ctrl+C should work
 518             except:
 519                 print("Queue full")
 520
 521         # Continue when all mimetypes are done for a bugzilla
 522         jobs.join()
 523
# Module-level settings read by manage_threads and generate_multi_threading.
max_threads = 20 # Number of threads to create, (1 = without multi-threading)
# Job queue shared by all worker threads; holds at most 40 pending jobs.
jobs = Queue.Queue(40)

# 1. All RSS-capable bug trackers, using the worker threads.
generate_multi_threading()

# 2. Red Hat Bugzilla through XML-RPC, one mimetype after the other.
for (mimetype,extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

# 3. Launchpad, which needs the optional launchpadlib module.
try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")
 536
 537 # vim:set shiftwidth=4 softtabstop=4 expandtab: