# -*- coding: utf-8 -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# This digs through a pile of bugzillas and populates the cwd with a big
# collection of bug-docs in per-filetype dirs, with bug-ids as names with
# prefixes to indicate which bug-tracker, e.g.
#
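#   fdo-bugid-X.suffix
#   rhbz-bugid-X.suffix
#   moz-bugid-X.suffix
#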
# where X is the n'th attachment of that type in the bug.
#
# The results are stored in the current directory, categorized by the
# extension of the downloaded file. When a file already exists, it is assumed
# to have been downloaded by a previous run and to be up to date.

from __future__ import print_function

import base64
import datetime
import glob
import os
import os.path
import re
import stat
import sys
import threading

import feedparser

try:
    import queue
except ImportError:
    import Queue as queue
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
try:
    import xmlrpc.client as xmlrpclib
except ImportError:
    import xmlrpclib
from xml.dom import minidom
from xml.sax.saxutils import escape

from attachment_mimetypes import mimetypes
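# attachment_mimetypes is expected to map a MIME type to the file extension
# used as both the download directory and the file suffix, e.g. something
# like 'application/vnd.oasis.opendocument.text' -> 'odt'.
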
def urlopen_retry(url):
    maxretries = 3
    for i in range(maxretries + 1):
        try:
            return urlopen(url)
        except IOError as e:
            print("caught IOError: " + str(e))
            if maxretries == i:
                raise
            print("retrying...")

def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(" mimetype is", end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # check if the attachment is deleted
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if the attachment is deleted (e.g. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                # prevent re-downloading FDO attachments from TDF
                if prefix == "tdf" and int(id) < 88776:
                    fdodownload = download.replace("tdf", "fdo")
                    if os.path.isfile(fdodownload):
                        print("assuming FDO " + fdodownload + " is up to date")
                        continue

                print('downloading as ' + download)
                tmpfile = download + ".tmp"
                f = open(tmpfile, 'wb')
                f.write(base64.b64decode(node.firstChild.nodeValue))
                f.close()
                os.rename(tmpfile, download)

def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        commentText = comment.firstChild.nodeValue
        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
        if not match:
            continue

        attachmentid += 1

        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
        if os.path.isfile(download):
            print("assuming " + download + " is up to date")
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print("attachment %s is not accessible" % realAttachmentId)
            continue
        print(" mimetype is", end=' ')

        info = handle.info()
        if hasattr(info, 'get_content_type'):
            remoteMime = info.get_content_type()  # Python 3 email.message.Message
        else:
            remoteMime = info.gettype()  # Python 2 mimetools.Message
        print(remoteMime, end=' ')
        if remoteMime != mimetype:
            print('skipping')
            continue

        print('downloading as ' + download)
        tmpfile = download + ".tmp"
        f = open(tmpfile, 'wb')
        f.write(handle.read())
        f.close()
        os.rename(tmpfile, download)

def create_query(mimetype):
    query = dict()
    query['query_format'] = 'advanced'
    query['field0-0-0'] = 'attachments.mimetype'
    query['type0-0-0'] = 'equals'
    query['value0-0-0'] = mimetype
    return query

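# e.g. create_query('application/vnd.oasis.opendocument.text') builds the
# advanced-search terms "attachments.mimetype equals <mimetype>"; the RSS and
# XML-RPC query paths below each extend the returned dict with their own
# parameters before submitting it.
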
def get_downloaded_files(prefix, suffix):
    return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))

def get_file_bz_ids(files, prefix):
    return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])

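# These two helpers map between files on disk and bug ids: e.g.
# get_downloaded_files('fdo', 'odt') globs 'odt/fdo*.odt', and
# get_file_bz_ids(['odt/fdo12345-1.odt'], 'fdo') yields {'12345'}.
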
def get_changed_date(files):
    newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
    # Subtract a day to avoid timezone differences. The worst thing that
    # can happen is that we are going to process more bugs than necessary.
    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)

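# e.g. if the newest downloaded file was modified at 2015-06-02 10:00 local
# time, get_changed_date returns datetime.date(2015, 6, 1).
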
def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    def process(query, full, have=[]):
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set([str(bug['id']) for bug in bugs])
                # we already have files from all available bugs
                if available.difference(set(have)) == set():
                    print("assuming all downloaded files are up to date")
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            print("A fault occurred")
            print("Fault code: %s" % err.faultCode)
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

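# Both this XML-RPC path and the RSS path below run in two phases: first a
# cheap incremental query limited to bugs changed since the newest file on
# disk, then a full query whose per-bug downloads are skipped early when
# every listed bug already has a file locally.
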
def get_through_rss_query(queryurl, mimetype, prefix, suffix):
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    # Getting detailed bug information and downloading an attachment body is
    # not possible without logging in to Novell bugzilla;
    # get_novell_bug_via_xml is a workaround for that situation.
    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml

    def process(query, full, have=[]):
        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        entries = []
        for entry in d['entries']:
            entries.append(entry)

        if full:
            available = set([str(entry['id'].split('=')[-1]) for entry in entries])
            # we already have files from all available bugs
            if available.difference(set(have)) == set():
                print("assuming all downloaded files are up to date")
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except KeyboardInterrupt:
                raise  # Ctrl+C should work
            except Exception:
                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype.replace("+", "%2B")))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

# Searching for bugs having attachments with specific mimetypes is not
# available in the launchpad API, so we iterate over all bugs of the most
# interesting source packages.
launchpad_pkgs = (
    "python-uniconvertor",
)

def get_launchpad_bugs(prefix):
    # the launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
    ubuntu = launchpad.distributions["ubuntu"]

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid",
            "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete",
            "Incomplete (with response)", "Incomplete (without response)",
            "Fix Released", "Opinion", "Expired"])

        for bugtask in pkgbugs:
            bug = bugtask.bug
            id = str(bug.id)
            print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
            attachmentid = 0

            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    # not a mimetype we are interested in
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except OSError:
                        pass

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix

                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                print('mimetype is ' + handle.content_type + ' downloading as ' + download)

                tmpfile = download + ".tmp"
                f = open(tmpfile, "wb")
                f.write(handle.read())
                f.close()
                os.rename(tmpfile, download)

rss_bugzillas = (
    ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), # added for abiword
    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
    ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
    # It seems something has changed and it is no longer possible to
    # download any files from Novell bugzilla.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # ( 'novell', 'https://bugzilla.novell.com/buglist.cgi' ),
    # note: running this script against bz.apache.org apparently causes one's IP
    # to be banned or something; you won't get new files in any case...
    # ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
)

redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in in order to get details of the
# bugs, such as attachment bodies. As a dirty workaround, we parse comments
# containing "Created an attachment (id=xxxxxx)" and download the attachments
# manually. python-bugzilla claims to support Novell bugzilla logins, but it
# is not working right now, and the Novell bugzilla login system is a
# nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='

class manage_threads(threading.Thread):
    def run(self):
        while True:
            try:
                # Try to receive a job from the queue, use its parameters to
                # run our query, then let the queue know we are done with it.
                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
                try:
                    get_through_rss_query(uri, mimetype, prefix, extension)
                finally:
                    jobs.task_done()
            except KeyboardInterrupt:
                raise  # Ctrl+C should work
            except queue.Empty:
                break

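# jobs.get(True, 6) blocks for up to 6 seconds; once the queue stays empty
# that long, queue.Empty ends the worker thread. The task_done() call in the
# finally block is what lets jobs.join() in generate_multi_threading() return.
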
def generate_multi_threading():

    # Initialize threads
    for i in range(max_threads):
        manage_threads().start()

    for (prefix, uri) in rss_bugzillas:

        # Create a job for every mimetype for a bugzilla
        for (mimetype, extension) in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere), so we would
            # always end up processing the complete list.
            if mimetype == 'text/html' and prefix == 'moz':
                continue

            jobs.put([uri, mimetype, prefix, extension], block=True)
            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)

        # Continue when all mimetypes are done for a bugzilla
        print("STARTED all bugtracker " + prefix)

    jobs.join()

max_threads = 20  # Number of threads to create (1 = without multi-threading)
jobs = queue.Queue()

generate_multi_threading()

# Red Hat bugzilla is queried through its XML-RPC interface rather than RSS
for (mimetype, extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")

# vim:set shiftwidth=4 softtabstop=4 expandtab: