bin/get-bugzilla-attachments-by-mimetype

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3 #
   4 # This file is part of the LibreOffice project.
   5 #
   6 # This Source Code Form is subject to the terms of the Mozilla Public
   7 # License, v. 2.0. If a copy of the MPL was not distributed with this
   8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
   9 #
  10
# This digs through a pile of bugzillas and populates the cwd with a big
# collection of bug-docs in per-filetype dirs, named after the bug-id with
# a prefix that indicates the bug-tracker, e.g.
#
# fdo<bugid>-X.suffix
# rhbz<bugid>-X.suffix
# moz<bugid>-X.suffix
#
# where X is the position of the attachment within the bug
  20 #
  21 # The results are stored in the current directory, categorized by the
  22 # extension of the downloaded file.  When a file already exists, it is assumed
  23 # it is already downloaded by a previous run, and up-to-date.
  24
  25 from __future__ import print_function
  26
  27 import base64
  28 import datetime
  29 import glob
  30 import os
  31 import os.path
  32 import re
  33 import stat
  34 import sys
  35 import threading
  36 try:
  37     import queue
  38 except Exception:
  39     import Queue as queue
  40 try:
  41     from urllib.request import urlopen
  42 except Exception:
  43     from urllib import urlopen
  44 try:
  45     import xmlrpc.client as xmlrpclib
  46 except Exception:
  47     import xmlrpclib
  48 from xml.dom import minidom
  49 from xml.sax.saxutils import escape
  50
  51 from attachment_mimetypes import mimetypes
  52
  53 import feedparser
  54
  55
  56 def urlopen_retry(url):
  57     """Open url, retry 3 times."""
  58     maxretries = 3
  59     for i in range(maxretries + 1):
  60         try:
  61             return urlopen(url)
  62         except IOError as e:
  63             print('caught IOError: ' + str(e))
  64             if maxretries == i:
  65                 raise
  66             print('retrying...')
  67
  68
  69 def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
  70     """Parse bug xml, download attachments with matching suffix."""
  71     bugid = url.rsplit('=', 2)[1]
  72     print('id is ' + prefix + bugid + ' ' + suffix)
  73     print('parsing ' + bugid)
  74     sock = urlopen_retry(url+'&ctype=xml')
  75     dom = minidom.parse(sock)
  76     sock.close()
  77     attachmentid = 0
  78     for attachment in dom.getElementsByTagName('attachment'):
  79         attachmentid += 1
  80         print(' mimetype is', end=' ')
  81         for node in attachment.childNodes:
  82             if node.nodeName == 'type':
  83                 # check if attachment is deleted
  84                 if not node.firstChild:
  85                     print('deleted attachment, skipping')
  86                     continue
  87
  88                 print(node.firstChild.nodeValue, end=' ')
  89                 if node.firstChild.nodeValue.lower() != mimetype.lower():
  90                     print('skipping')
  91                     break
  92             elif node.nodeName == 'data':
  93                 # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
  94                 if not node.firstChild:
  95                     print('deleted attachment, skipping')
  96                     continue
  97
  98                 download = (suffix + '/' + prefix + bugid + '-' +
  99                             str(attachmentid) + '.' + suffix)
 100                 if os.path.isfile(download):
 101                     print('assuming ' + download + ' is up to date')
 102                     continue
 103
 104                 # prevent re-downloading FDO attachments from TDF
 105                 if prefix == 'tdf' and int(bugid) < 88776:
 106                     fdodownload = download.replace('tdf', 'fdo')
 107                     if os.path.isfile(fdodownload):
 108                         print('assuming FDO ' + fdodownload + ' is up to date')
 109                         continue
 110
 111                 print('downloading as ' + download)
 112                 tmpfile = download + '.tmp'
 113                 f = open(tmpfile, 'wb')
 114                 f.write(base64.b64decode(node.firstChild.nodeValue))
 115                 f.close()
 116                 os.rename(tmpfile, download)
 117                 break
 118
 119
 120 def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
 121     """Parse bug xml, download attachments with matching suffix."""
 122     bugid = url.rsplit('=', 2)[1]
 123     print('id is ' + prefix + bugid + ' ' + suffix)
 124     print('parsing ' + bugid)
 125     sock = urlopen_retry(url+'&ctype=xml')
 126     dom = minidom.parse(sock)
 127     sock.close()
 128     attachmentid = 0
 129     for comment in dom.getElementsByTagName('thetext'):
 130         commentText = comment.firstChild.nodeValue
 131         match = re.search(r'.*Created an attachment \(id=([0-9]+)\)',
 132                           commentText)
 133         if not match:
 134             continue
 135
 136         attachmentid += 1
 137
 138         download = (suffix + '/' + prefix + bugid + '-' +
 139                     str(attachmentid) + '.' + suffix)
 140         if os.path.isfile(download):
 141             print('assuming ' + download + ' is up to date')
 142             continue
 143
 144         realAttachmentId = match.group(1)
 145         handle = urlopen_retry(novellattach + realAttachmentId)
 146         if not handle:
 147             print('attachment ' + realAttachmentId + ' is not accessible')
 148             continue
 149         print(' mimetype is', end=' ')
 150
 151         info = handle.info()
 152         if info.get_content_type:
 153             remoteMime = info.get_content_type()
 154         else:
 155             remoteMime = info.gettype()
 156         print(remoteMime, end=' ')
 157         if remoteMime != mimetype:
 158             print('skipping')
 159             continue
 160
 161         print('downloading as ' + download)
 162         tmpfile = download + '.tmp'
 163         f = open(tmpfile, 'wb')
 164         f.write(handle.read())
 165         f.close()
 166         os.rename(tmpfile, download)
 167
 168
 169 def create_query(mimetype):
 170     """Query all bugs with suitable mimetype attachments."""
 171     query = {}
 172     query['query_format'] = 'advanced'
 173     query['field0-0-0'] = 'attachments.mimetype'
 174     query['type0-0-0'] = 'equals'
 175     query['value0-0-0'] = mimetype
 176     return query
 177
 178
 179 def get_downloaded_files(prefix, suffix):
 180     """Generate list of existing downloads (matching pre/suffix)."""
 181     return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
 182
 183
 184 def get_file_bz_ids(files, prefix):
 185     """Generate list of existing downloads (matching pre/suffix)."""
 186     return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])
 187
 188
 189 def get_changed_date(files):
 190     """Compute date of last downloaded attachment."""
 191     newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
 192     # Subtract a day to avoid timezone differences. The worst thing that
 193     # can happen is that we are going to process more bugs than necessary.
 194     return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
 195
 196
 197 def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
 198     """Poke Bugzilla via RPC query."""
 199     try:
 200         os.mkdir(suffix)
 201     except Exception:
 202         pass
 203
 204     def process(query, full, have=[]):
 205         try:
 206             proxy = xmlrpclib.ServerProxy(rpcurl)
 207             result = proxy.Bug.search(query)
 208             bugs = result['bugs']
 209             print(str(len(bugs)) + ' bugs to process')
 210
 211             if full:
 212                 available = set([str(bug['id']) for bug in bugs])
 213                 # we already have files from all available bugs
 214                 if available.difference(set(have)) == set():
 215                     print('assuming all downloaded files are up to date')
 216                     return
 217
 218             for bug in bugs:
 219                 url = showurl + str(bug['id'])
 220                 get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
 221         except xmlrpclib.Fault as err:
 222             print('A fault occurred')
 223             print('Fault code: ' + err.faultCode)
 224             print(err.faultString)
 225
 226     query = create_query(mimetype)
 227     query['column_list'] = 'bug_id'
 228
 229     files = get_downloaded_files(prefix, suffix)
 230
 231     if files != []:
 232         print('looking for updated bugs having %s attachment(s)' % mimetype)
 233         query_changed = query.copy()
 234         query_changed['field0-1-0'] = 'days_elapsed'
 235         query_changed['type0-1-0'] = 'lessthaneq'
 236         query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
 237         process(query_changed, False)
 238
 239     print('looking for all bugs having %s attachment(s)' % mimetype)
 240     process(query, True, get_file_bz_ids(files, prefix))
 241
 242
 243 def get_through_rss_query(queryurl, mimetype, prefix, suffix):
 244     """Poke Bugzilla via RSS query."""
 245     try:
 246         os.mkdir(suffix)
 247     except Exception:
 248         pass
 249
 250     # Getting detailed bug information and downloading an attachment
 251     # body is not possible without logging in to Novell bugzilla
 252     # get_novell_bug_via_xml function is a workaround for that
 253     # situation
 254     get_bug_function = get_novell_bug_via_xml if prefix == 'novell' else get_from_bug_url_via_xml
 255
 256     def process(query, full, have=[]):
 257         url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
 258         print('url is ' + url)
 259         d = feedparser.parse(url)
 260         print(str(len(d['entries'])) + ' bugs to process')
 261
 262         entries = d['entries']
 263         if full:
 264             available = set([str(entry['id'].split('=')[-1]) for entry in entries])
 265             # we already have files from all available bugs
 266             if available.difference(set(have)) == set():
 267                 print('assuming all downloaded files are up to date')
 268                 return
 269
 270         for entry in entries:
 271             try:
 272                 get_bug_function(entry['id'], mimetype, prefix, suffix)
 273             except KeyboardInterrupt:
 274                 raise # Ctrl+C should work
 275             except Exception:
 276                 print(entry['id'] + ' failed: ' + str(sys.exc_info()[0]))
 277                 pass
 278
 279     query = create_query(escape(mimetype.replace('+', '%2B')))
 280     query['ctype'] = 'rss'
 281
 282     files = get_downloaded_files(prefix, suffix)
 283
 284     if files != []:
 285         print('looking for updated bugs having %s attachment(s)' % mimetype)
 286         query_changed = query.copy()
 287         query_changed['field0-1-0'] = 'delta_ts'
 288         query_changed['type0-1-0'] = 'greaterthaneq'
 289         query_changed['value0-1-0'] = get_changed_date(files).isoformat()
 290         process(query_changed, False)
 291
 292     print('looking for all bugs having %s attachment(s)' % mimetype)
 293     process(query, True, get_file_bz_ids(files, prefix))
 294
 295
# Searching for bugs that have attachments with a specific mimetype is not
# available in the launchpad API, so get_launchpad_bugs() instead iterates
# over all bugs of the most interesting Ubuntu source packages, listed here.
launchpad_pkgs = (
    'abiword',
    'calibre',
    'calligra',
    'gnumeric',
    'inkscape',
    'koffice',
    'libabw',
    'libcdr',
    'libe-book',
    'libetonyek',
    'libfreehand',
    'libmspub',
    'libmwaw',
    'liborcus',
    'libpagemaker',
    'libreoffice',
    'libvisio',
    'libwpd',
    'libwpg',
    'libwps',
    'openoffice.org',
    'python-uniconvertor',
    'scribus',
    'sk1',
    'unoconv',
)
 326
 327
 328 def get_launchpad_bugs(prefix):
 329     """Query launchpad bugtracker (via launchpadlib)."""
 330     # launchpadlib python module is required to download launchpad attachments
 331     from launchpadlib.launchpad import Launchpad
 332
 333     launchpad = Launchpad.login_anonymously('attachmentdownload', 'production')
 334     ubuntu = launchpad.distributions['ubuntu']
 335
 336     for pkg in launchpad_pkgs:
 337         srcpkg = ubuntu.getSourcePackage(name=pkg)
 338         pkgbugs = srcpkg.searchTasks(status=['New', 'Fix Committed', 'Invalid',
 339                                              "Won't Fix", 'Confirmed',
 340                                              'Triaged', 'In Progress',
 341                                              'Incomplete',
 342                                              'Incomplete (with response)',
 343                                              'Incomplete (without response)',
 344                                              'Fix Released', 'Opinion',
 345                                              'Expired'])
 346
 347         for bugtask in pkgbugs:
 348             bug = bugtask.bug
 349             bugid = str(bug.id)
 350             print('parsing ' + bugid + ' status: ' + bugtask.status +
 351                   ' title: ' + bug.title[:50])
 352             attachmentid = 0
 353             for attachment in bug.attachments:
 354                 attachmentid += 1
 355                 handle = attachment.data.open()
 356                 if handle.content_type not in mimetypes:
 357                     #print "skipping"
 358                     continue
 359
 360                 suffix = mimetypes[handle.content_type]
 361                 if not os.path.isdir(suffix):
 362                     try:
 363                         os.mkdir(suffix)
 364                     except Exception:
 365                         pass
 366
 367                 download = (suffix + '/' + prefix + bugid + '-' +
 368                             str(attachmentid) + '.' + suffix)
 369
 370                 if os.path.isfile(download):
 371                     print('assuming ' + bugid + ' is up to date')
 372                     break
 373
 374                 print('mimetype is ' + handle.content_type +
 375                       ' downloading as ' + download)
 376
 377                 tmpfile = download + '.tmp'
 378                 f = open(tmpfile, 'wb')
 379                 f.write(handle.read())
 380                 f.close()
 381                 os.rename(tmpfile, download)
 382
 383
# Bugzilla instances that are queried through their RSS bug list, as
# (prefix, buglist URL) pairs.  The prefix identifies the tracker in the
# names of the downloaded files.  Commented-out entries are kept on purpose
# to document why a tracker is not queried.
rss_bugzillas = (
# note: currently abisource has an expired TLS cert
#    ('abi', 'http://bugzilla.abisource.com/buglist.cgi'), #added for abiword
    ('fdo', 'http://bugs.freedesktop.org/buglist.cgi'),
    ('gentoo', 'http://bugs.gentoo.org/buglist.cgi'),
#    ('gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
    ('kde', 'http://bugs.kde.org/buglist.cgi'), # added for koffice/calligra
    ('mandriva', 'https://qa.mandriva.com/buglist.cgi'),
    ('moz', 'https://bugzilla.mozilla.org/buglist.cgi'),
    # It seems something has changed and it is no longer possible to
    # download any files from there.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
# note: running this script against bz.apache.org apparently causes one's IP
# to be banned or something; you won't get new files in any case...
#    ('ooo', 'https://bz.apache.org/ooo/buglist.cgi'),
    ('tdf', 'http://bugs.documentfoundation.org/buglist.cgi'),
)
 403
# Red Hat Bugzilla is queried over XML-RPC (see get_through_rpc_query):
# redhatrpc is the RPC endpoint, redhatbug the URL of a single bug, to
# which the bug id is appended.
redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in, in order to get details of
# the bugs such as attachment bodies etc.  As a dirty workaround, we
# parse comments containing "Created an attachment (id=xxxxxx)" and
# download the attachments manually.  python-bugzilla claims that it
# supports Novell bugzilla login, but it's not working right now and
# the Novell bugzilla login system is a nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
 414
 415
 416 class manage_threads(threading.Thread):
 417     def run(self):
 418         while 1:
 419             # Try to receive a job from queue
 420             try:
 421                 # Get job from queue
 422                 # Use job parameters to call our query
 423                 # Then let the queue know we are done with this job
 424                 (uri, mimetype, prefix, extension) = jobs.get(True, 6)
 425                 try:
 426                     # set thread name for easier debugging, if process
 427                     # ctl package is available
 428                     import prctl
 429                     prctl.set_name(prefix[:3] + ': ' + mimetype[-10:])
 430                 except ImportError:
 431                     pass
 432
 433                 try:
 434                     get_through_rss_query(uri, mimetype, prefix, extension)
 435                 finally:
 436                     jobs.task_done()
 437             except KeyboardInterrupt:
 438                 raise # Ctrl+C should work
 439             except queue.Empty:
 440                 break
 441
 442
 443 def generate_multi_threading():
 444
 445     # Initialize threads
 446     for _i in range(max_threads):
 447         manage_threads().start()
 448
 449     for (prefix, uri) in rss_bugzillas:
 450
 451         # Create a job for every mimetype for a bugzilla
 452         for (mimetype, extension) in mimetypes.items():
 453             # It seems that bugzilla has problems returning that many results
 454             # (10000 results is probably a limit set somewhere) so we always
 455             # end processing the complete list.
 456             if mimetype == 'text/html' and prefix == 'moz':
 457                 continue
 458
 459             jobs.put([uri, mimetype, prefix, extension], block=True)
 460             print('successfully placed a job in the queue searching for ' +
 461                   mimetype + ' in bugtracker ' + prefix)
 462
 463         # Continue when all mimetypes are done for a bugzilla
 464         print('STARTED all bugtracker ' + prefix)
 465
 466     jobs.join()
 467
 468
 469 # Number of threads to create, (1 = without multi-threading, default = 20)
 470 max_threads = int(os.environ.get('PARALLELISM', 20))
 471 jobs = queue.Queue()
 472
 473 generate_multi_threading()
 474
 475 for (mimetype, extension) in mimetypes.items():
 476     get_through_rpc_query(redhatrpc, redhatbug, mimetype, 'rhbz', extension)
 477
 478 try:
 479     get_launchpad_bugs('lp')
 480 except ImportError:
 481     print('launchpadlib unavailable, skipping Ubuntu tracker')
 482
 483 # vim:set shiftwidth=4 softtabstop=4 expandtab: