2 # -*- coding: utf-8 -*-
4 # This file is part of the LibreOffice project.
6 # This Source Code Form is subject to the terms of the Mozilla Public
7 # License, v. 2.0. If a copy of the MPL was not distributed with this
8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 # This digs through a pile of bugzillas and populates the cwd with a big
12 # collection of bug-docs in per-filetype dirs with bug-ids as names with
13 # prefixes to indicate which bug-tracker, e.g.
19 # where X is the n'th attachment of that type in the bug
21 # The results are stored in the current directory, categorized by the
22 # extension of the downloaded file. When a file already exists, it is assumed
23 # to have been downloaded by a previous run and to be up to date.
25 from __future__
import print_function
34 import threading
, Queue
36 from urllib
.request
import urlopen
38 from urllib
import urlopen
40 import xmlrpc
.client
as xmlrpclib
43 from xml
.dom
import minidom
44 from xml
.sax
.saxutils
import escape
46 def urlopen_retry(url
):
48 for i
in range(maxretries
+ 1):
52 print("caught IOError: " + str(e
))
57 def get_from_bug_url_via_xml(url
, mimetype
, prefix
, suffix
):
58 id = url
.rsplit('=', 2)[1]
59 print("id is " + prefix
+ id + " " + suffix
)
60 print("parsing " + id)
61 sock
= urlopen_retry(url
+"&ctype=xml")
62 dom
= minidom
.parse(sock
)
65 for attachment
in dom
.getElementsByTagName('attachment'):
67 print(" mimetype is", end
=' ')
68 for node
in attachment
.childNodes
:
69 if node
.nodeName
== 'type':
70 print(node
.firstChild
.nodeValue
, end
=' ')
71 if node
.firstChild
.nodeValue
.lower() != mimetype
.lower():
74 elif node
.nodeName
== 'data':
75 # check if attachment is deleted (e.g. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
76 if not node
.firstChild
:
77 print('deleted attachment, skipping')
80 download
= suffix
+ '/' +prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
81 if os
.path
.isfile(download
):
82 print("assuming " + download
+ " is up to date")
85 print('downloading as ' + download
)
86 f
= open(download
, 'wb')
87 f
.write(base64
.b64decode(node
.firstChild
.nodeValue
))
91 def get_novell_bug_via_xml(url
, mimetype
, prefix
, suffix
):
92 id = url
.rsplit('=', 2)[1]
93 print("id is " + prefix
+ id + " " + suffix
)
94 print("parsing " + id)
95 sock
= urlopen_retry(url
+"&ctype=xml")
96 dom
= minidom
.parse(sock
)
99 for comment
in dom
.getElementsByTagName('thetext'):
100 commentText
= comment
.firstChild
.nodeValue
101 match
= re
.search(r
".*Created an attachment \(id=([0-9]+)\)", commentText
)
107 download
= suffix
+ '/' + prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
108 if os
.path
.isfile(download
):
109 print("assuming " + download
+ " is up to date")
112 realAttachmentId
= match
.group(1)
113 handle
= urlopen_retry(novellattach
+ realAttachmentId
)
115 print("attachment %s is not accessible" % realAttachmentId
)
117 print(" mimetype is", end
=' ')
120 if info
.get_content_type
:
121 remoteMime
= info
.get_content_type()
123 remoteMime
= info
.gettype()
124 print(remoteMime
, end
=' ')
125 if remoteMime
!= mimetype
:
129 print('downloading as ' + download
)
130 f
= open(download
, 'wb')
131 f
.write(handle
.read())
134 def create_query(mimetype
):
136 query
['query_format']='advanced'
137 query
['field0-0-0']='attachments.mimetype'
138 query
['type0-0-0']='equals'
139 query
['value0-0-0']=mimetype
def get_downloaded_files(prefix, suffix):
    """List the attachments already on disk for one tracker and file type.

    The ``suffix`` argument serves both as the directory name (relative to
    the current working directory) and as the file extension; only names
    that begin with ``prefix`` are matched.

    Returns the list of matching paths as produced by ``glob.glob`` (empty
    when the directory does not exist or nothing matches).
    """
    pattern = '%s*.%s' % (prefix, suffix)
    return glob.glob(os.path.join(suffix, pattern))
def get_file_bz_ids(files, prefix):
    """Collect the bug ids encoded in a list of downloaded file names.

    For each path, the base name is cut at the first '-' and the first
    occurrence of ``prefix`` is removed from what remains; the result is
    taken to be the bug id.

    Returns a set of id strings (empty for an empty ``files``).
    """
    bug_ids = set()
    for path in files:
        tracker_and_id = os.path.basename(path).split('-')[0]
        bug_ids.add(tracker_and_id.replace(prefix, '', 1))
    return bug_ids
def get_changed_date(files):
    """Return the date one day before the newest modification time in *files*.

    ``files`` is an iterable of paths that are stat'ed on disk. ``max`` raises
    ``ValueError`` when it is empty.
    """
    one_day_in_seconds = 24 * 60 * 60
    latest_mtime = max(os.stat(path)[stat.ST_MTIME] for path in files)
    # Going back a full day papers over timezone differences; the only cost
    # is that a few more bugs than strictly necessary get processed again.
    return datetime.date.fromtimestamp(latest_mtime - one_day_in_seconds)
154 def get_through_rpc_query(rpcurl
, showurl
, mimetype
, prefix
, suffix
):
160 def process(query
, full
, have
=[]):
162 proxy
= xmlrpclib
.ServerProxy(rpcurl
)
163 result
= proxy
.Bug
.search(query
)
164 bugs
= result
['bugs']
165 print(str(len(bugs
)) + ' bugs to process')
168 available
= set([str(bug
['id']) for bug
in bugs
])
169 # we already have files from all available bugs
170 if available
.difference(set(have
)) == set():
171 print("assuming all downloaded files are up to date")
175 url
= showurl
+ str(bug
['id'])
176 get_from_bug_url_via_xml(url
, mimetype
, prefix
, suffix
)
177 except xmlrpclib
.Fault
as err
:
178 print("A fault occurred")
179 print("Fault code: %s" % err
.faultCode
)
180 print(err
.faultString
)
182 query
= create_query(mimetype
)
183 query
['column_list']='bug_id'
185 files
= get_downloaded_files(prefix
, suffix
)
188 print('looking for updated bugs having %s attachment(s)' % mimetype
)
189 query_changed
= query
.copy()
190 query_changed
['field0-1-0'] = 'days_elapsed'
191 query_changed
['type0-1-0'] = 'lessthaneq'
192 query_changed
['value0-1-0'] = str((datetime
.date
.today() - get_changed_date(files
)).days
)
193 process(query_changed
, False)
195 print('looking for all bugs having %s attachment(s)' % mimetype
)
196 process(query
, True, get_file_bz_ids(files
, prefix
))
198 def get_through_rss_query(queryurl
, mimetype
, prefix
, suffix
):
204 #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
205 #get_novell_bug_via_xml function is a workaround for that situation
206 get_bug_function
= get_novell_bug_via_xml
if prefix
== "novell" else get_from_bug_url_via_xml
208 def process(query
, full
, have
=[]):
209 url
= queryurl
+ '?' + '&'.join(['='.join(kv
) for kv
in query
.iteritems()])
210 print('url is ' + url
)
211 d
= feedparser
.parse(url
)
212 print(str(len(d
['entries'])) + ' bugs to process')
215 available
= set([str(entry
['id'].split('=')[-1]) for entry
in d
['entries']])
216 # we already have files from all available bugs
217 if available
.difference(set(have
)) == set():
218 print("assuming all downloaded files are up to date")
221 for entry
in d
['entries']:
223 get_bug_function(entry
['id'], mimetype
, prefix
, suffix
)
224 except KeyboardInterrupt:
225 raise # Ctrl+C should work
227 print(entry
['id'] + " failed: " + str(sys
.exc_info()[0]))
230 query
= create_query(escape(mimetype
))
231 query
['ctype'] = 'rss'
233 files
= get_downloaded_files(prefix
, suffix
)
236 print('looking for updated bugs having %s attachment(s)' % mimetype
)
237 query_changed
= query
.copy()
238 query_changed
['field0-1-0'] = 'changed'
239 query_changed
['type0-1-0'] = 'changedbefore'
240 query_changed
['value0-1-0'] = get_changed_date(files
).isoformat()
241 process(query_changed
, False)
243 print('looking for all bugs having %s attachment(s)' % mimetype
)
244 process(query
, True, get_file_bz_ids(files
, prefix
))
246 #since searching bugs having attachments with specific mimetypes is not available in launchpad API
247 #we're iterating over all bugs of the most interesting source packages
270 "python-uniconvertor",
276 def get_launchpad_bugs(prefix
):
277 #launchpadlib python module is required to download launchpad attachments
278 from launchpadlib
.launchpad
import Launchpad
280 launchpad
= Launchpad
.login_anonymously("attachmentdownload", "production")
281 ubuntu
= launchpad
.distributions
["ubuntu"]
283 for pkg
in launchpad_pkgs
:
284 srcpkg
= ubuntu
.getSourcePackage(name
=pkg
)
285 pkgbugs
= srcpkg
.searchTasks(status
=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])
287 for bugtask
in pkgbugs
:
290 print("parsing " + id + " status: " + bugtask
.status
+ " title: " + bug
.title
[:50])
292 for attachment
in bug
.attachments
:
294 handle
= attachment
.data
.open()
295 if not handle
.content_type
in mimetypes
:
299 suffix
= mimetypes
[handle
.content_type
]
300 if not os
.path
.isdir(suffix
):
306 download
= suffix
+ '/' + prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
308 if os
.path
.isfile(download
):
309 print("assuming " + id + " is up to date")
312 print('mimetype is ' + handle
.content_type
+ ' downloading as ' + download
)
314 f
= open(download
, "w")
315 f
.write(handle
.read())
319 'abi': 'http://bugzilla.abisource.com/buglist.cgi', #added for abiword
320 'fdo': 'http://bugs.libreoffice.org/buglist.cgi',
321 'gentoo': 'http://bugs.gentoo.org/buglist.cgi',
322 'gnome': 'http://bugzilla.gnome.org/buglist.cgi', # added for gnumeric
323 'kde': 'http://bugs.kde.org/buglist.cgi', # added for koffice/calligra
324 'mandriva': 'https://qa.mandriva.com/buglist.cgi',
325 'moz': 'https://bugzilla.mozilla.org/buglist.cgi',
326 # It seems something has changed and it is no longer possible to
327 # download any files from there.
328 # NOTE: I am leaving it in the list, commented out, just so someone
329 # does not add it back immediately .-)
330 # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
331 'ooo': 'https://bz.apache.org/ooo/buglist.cgi',
332 'tdf': 'http://bugs.documentfoundation.org/buglist.cgi',
335 redhatrpc
= 'https://bugzilla.redhat.com/xmlrpc.cgi'
336 redhatbug
= 'https://bugzilla.redhat.com/show_bug.cgi?id='
338 #Novell Bugzilla requires users to log in in order to get details of the bugs such as attachment bodies etc.
339 #As a dirty workaround, we parse comments containing "Created an attachment (id=xxxxxx)" and download attachments manually
340 #python-bugzilla claims that it supports Novell bugzilla login but it's not working right now and novell bugzilla login
341 #system is a nightmare
342 novellattach
= 'https://bugzilla.novell.com/attachment.cgi?id='
346 'application/vnd.oasis.opendocument.base': 'odb',
347 'application/vnd.oasis.opendocument.database': 'odb',
348 'application/vnd.oasis.opendocument.chart': 'odc',
349 'application/vnd.oasis.opendocument.chart-template': 'otc',
350 'application/vnd.oasis.opendocument.formula': 'odf',
351 'application/vnd.oasis.opendocument.formula-template': 'otf',
352 'application/vnd.oasis.opendocument.graphics': 'odg',
353 'application/vnd.oasis.opendocument.graphics-template': 'otg',
354 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
355 'application/vnd.oasis.opendocument.presentation': 'odp',
356 'application/vnd.oasis.opendocument.presentation-template': 'otp',
357 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
358 'application/vnd.oasis.opendocument.spreadsheet': 'ods',
359 'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
360 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
361 'application/vnd.oasis.opendocument.text': 'odt',
362 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
363 'application/vnd.oasis.opendocument.text-master': 'odm',
364 'application/vnd.oasis.opendocument.text-template': 'ott',
365 'application/vnd.oasis.opendocument.text-master-template': 'otm',
366 'application/vnd.oasis.opendocument.text-web': 'oth',
368 'application/vnd.sun.xml.base': 'odb',
369 'application/vnd.sun.xml.calc': 'sxc',
370 'application/vnd.sun.xml.calc.template': 'stc',
371 'application/vnd.sun.xml.chart': 'sxs',
372 'application/vnd.sun.xml.draw': 'sxd',
373 'application/vnd.sun.xml.draw.template': 'std',
374 'application/vnd.sun.xml.impress': 'sxi',
375 'application/vnd.sun.xml.impress.template': 'sti',
376 'application/vnd.sun.xml.math': 'sxm',
377 'application/vnd.sun.xml.writer': 'sxw',
378 'application/vnd.sun.xml.writer.global': 'sxg',
379 'application/vnd.sun.xml.writer.template': 'stw',
380 'application/vnd.sun.xml.writer.web': 'stw',
382 'application/rtf': 'rtf',
384 'application/msword': 'doc',
385 'application/vnd.ms-powerpoint': 'ppt',
386 'application/vnd.ms-excel': 'xls',
387 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
388 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
389 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
390 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
391 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
392 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
393 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
394 'application/vnd.ms-word.document.macroEnabled.12': 'docm',
395 'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
396 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
397 'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
398 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
399 'application/vnd.openxmlformats-officedocument.presentationml.template': 'ppotx',
400 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
401 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
402 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
403 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
404 'application/vnd.visio': 'vsd',
405 'application/vnd.visio.xml': 'vdx',
406 'application/x-mspublisher': 'pub',
408 'application/xhtml+xml': 'xhtml',
409 'application/mathml+xml': 'mml',
411 'application/docbook+xml': 'docbook',
414 'text/spreadsheet': 'slk',
415 'application/x-dbase': 'dbf',
416 'application/vnd.corel-draw': 'cdr',
417 'application/vnd.lotus-wordpro': 'lwp',
418 'application/vnd.lotus-1-2-3': 'wks',
419 'application/vnd.wordperfect': 'wpd',
420 'application/wordperfect5.1': 'wpd',
421 'application/vnd.ms-works': 'wps',
422 'application/clarisworks' : 'cwk',
423 'application/macwriteii' : 'mw',
424 'application/vnd.apple.keynote': 'key',
425 'application/vnd.apple.numbers': 'numbers',
426 'application/vnd.apple.pages': 'pages',
427 'application/x-iwork-keynote-sffkey': 'key',
428 'application/x-iwork-numbers-sffnumbers': 'numbers',
429 'application/x-iwork-pages-sffpages': 'pages',
430 'application/x-hwp': 'hwp',
431 'application/x-aportisdoc': 'pdb',
432 'application/prs.plucker' : 'pdb_plucker',
433 'application/vnd.palm' : 'pdb_palm',
434 'application/x-sony-bbeb' : 'lrf',
435 'application/x-pocket-word': 'psw',
436 'application/x-t602': '602',
437 'application/x-fictionbook+xml': 'fb2',
438 'application/x-abiword': 'abw',
439 'application/x-pagemaker': 'pmd',
440 # relatively uncommon image mimetypes
441 'image/x-freehand': 'fh',
443 'image/tiff': 'tiff',
444 'image/vnd.dxf': 'dxf',
445 'image/x-emf': 'emf',
446 'image/x-targa': 'tga',
447 'image/x-sgf': 'sgf',
448 'image/x-svm': 'svm',
449 'image/x-wmf': 'wmf',
450 'image/x-pict': 'pict',
451 'image/x-cmx': 'cmx',
452 'image/svg+xml': 'svg',
453 'image/x-MS-bmp': 'bmp',
454 'image/x-wpg': 'wpg',
455 'image/x-eps': 'eps',
456 'image/x-met': 'met',
457 'image/x-portable-bitmap': 'pbm',
458 'image/x-photo-cd': 'pcd',
459 'image/x-pcx': 'pcx',
460 'image/x-portable-graymap': 'pgm',
461 'image/x-portable-pixmap': 'ppm',
462 'image/vnd.adobe.photoshop': 'psd',
463 'image/x-cmu-raster': 'ras',
464 'image/x-sun-raster': 'ras',
465 'image/x-xbitmap': 'xbm',
466 'image/x-xpixmap': 'xpm',
469 # disabled for now, this would download gigs of pngs/jpegs...
470 common_noncore_mimetypes
= {
473 'image/jpeg': 'jpeg',
476 'application/pdf': 'pdf',
479 class manage_threads(threading
.Thread
):
481 #print(threading.current_thread().get_ident())
483 # Try to receive a job from queue
486 # Use job parameters to call our query
487 # Then let the queue know we are done with this job
488 job
= jobs
.get(True,5)
489 get_through_rss_query(job
[0], job
[1], job
[2], job
[3]) # [0] = uri; [1] = mimetype; [2] = prefix; [3] = extension
491 except KeyboardInterrupt:
492 raise # Ctrl+C should work
496 def generate_multi_threading():
497 for (prefix
, uri
) in rss_bugzillas
.items():
500 for i
in xrange(max_threads
):
501 manage_threads().start()
503 # Create a job for every mimetype for a bugzilla
504 for (mimetype
,extension
) in mimetypes
.items():
507 # It seems that bugzilla has problems returning that many results
508 # (10000 results is probably a limit set somewhere) so we always
509 # end processing the complete list.
510 if mimetype
== 'text/html' and prefix
== 'moz':
514 jobs
.put([uri
, mimetype
, prefix
, extension
], block
=True, timeout
=3)
515 print("successfully placed a job in the queue searching for " + mimetype
+ " in bugtracker " + prefix
)
516 except KeyboardInterrupt:
517 raise # Ctrl+C should work
521 # Continue when all mimetypes are done for a bugzilla
524 max_threads
= 20 # Number of threads to create, (1 = without multi-threading)
525 jobs
= Queue
.Queue(40)
527 generate_multi_threading()
529 for (mimetype
,extension
) in mimetypes
.items():
530 get_through_rpc_query(redhatrpc
, redhatbug
, mimetype
, "rhbz", extension
)
533 get_launchpad_bugs("lp")
535 print("launchpadlib unavailable, skipping Ubuntu tracker")
537 # vim:set shiftwidth=4 softtabstop=4 expandtab: