2 # -*- coding: utf-8 -*-
4 # This file is part of the LibreOffice project.
6 # This Source Code Form is subject to the terms of the Mozilla Public
7 # License, v. 2.0. If a copy of the MPL was not distributed with this
8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
# This digs through a pile of bugzillas and populates the cwd with a big
# collection of bug-docs in per-filetype dirs, with bug-ids as names and
# prefixes to indicate which bug-tracker they came from, e.g.
#
#     <suffix>/<prefix><bug-id>-X.<suffix>   (for instance odt/fdo12345-1.odt)
#
# where X is the n'th attachment of that type in the bug.
21 # The results are stored in the current directory, categorized by the
22 # extension of the downloaded file. When a file already exists, it is assumed
23 # it is already downloaded by a previous run, and up-to-date.
25 from __future__
import print_function
40 from urllib
.request
import urlopen
42 from urllib
import urlopen
44 import xmlrpc
.client
as xmlrpclib
47 from xml
.dom
import minidom
48 from xml
.sax
.saxutils
import escape
50 def urlopen_retry(url
):
52 for i
in range(maxretries
+ 1):
56 print("caught IOError: " + str(e
))
61 def get_from_bug_url_via_xml(url
, mimetype
, prefix
, suffix
):
62 id = url
.rsplit('=', 2)[1]
63 print("id is " + prefix
+ id + " " + suffix
)
64 print("parsing " + id)
65 sock
= urlopen_retry(url
+"&ctype=xml")
66 dom
= minidom
.parse(sock
)
69 for attachment
in dom
.getElementsByTagName('attachment'):
71 print(" mimetype is", end
=' ')
72 for node
in attachment
.childNodes
:
73 if node
.nodeName
== 'type':
74 print(node
.firstChild
.nodeValue
, end
=' ')
75 if node
.firstChild
.nodeValue
.lower() != mimetype
.lower():
78 elif node
.nodeName
== 'data':
79 # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
80 if not node
.firstChild
:
81 print('deleted attachment, skipping')
84 download
= suffix
+ '/' +prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
85 if os
.path
.isfile(download
):
86 print("assuming " + download
+ " is up to date")
89 # prevent re-downloading FDO attachments from TDF
90 if prefix
== "tdf" and int(id) < 88776:
91 fdodownload
= download
.replace("tdf", "fdo")
92 if os
.path
.isfile(fdodownload
):
93 print("assuming FDO " + fdodownload
+ " is up to date")
96 print('downloading as ' + download
)
97 tmpfile
= download
+ ".tmp"
98 f
= open(tmpfile
, 'wb')
99 f
.write(base64
.b64decode(node
.firstChild
.nodeValue
))
101 os
.rename(tmpfile
, download
)
104 def get_novell_bug_via_xml(url
, mimetype
, prefix
, suffix
):
105 id = url
.rsplit('=', 2)[1]
106 print("id is " + prefix
+ id + " " + suffix
)
107 print("parsing " + id)
108 sock
= urlopen_retry(url
+"&ctype=xml")
109 dom
= minidom
.parse(sock
)
112 for comment
in dom
.getElementsByTagName('thetext'):
113 commentText
= comment
.firstChild
.nodeValue
114 match
= re
.search(r
".*Created an attachment \(id=([0-9]+)\)", commentText
)
120 download
= suffix
+ '/' + prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
121 if os
.path
.isfile(download
):
122 print("assuming " + download
+ " is up to date")
125 realAttachmentId
= match
.group(1)
126 handle
= urlopen_retry(novellattach
+ realAttachmentId
)
128 print("attachment %s is not accessible" % realAttachmentId
)
130 print(" mimetype is", end
=' ')
133 if info
.get_content_type
:
134 remoteMime
= info
.get_content_type()
136 remoteMime
= info
.gettype()
137 print(remoteMime
, end
=' ')
138 if remoteMime
!= mimetype
:
142 print('downloading as ' + download
)
143 tmpfile
= download
+ ".tmp"
144 f
= open(tmpfile
, 'wb')
145 f
.write(handle
.read())
147 os
.rename(tmpfile
, download
)
def create_query(mimetype):
    """Build the base bugzilla search parameters for one attachment mimetype.

    Returns a fresh dict on every call, so callers are free to add keys to it
    or to ``.copy()`` it without affecting other queries.

    :param mimetype: value the ``attachments.mimetype`` field must equal.
    :return: dict of bugzilla advanced-search parameters.
    """
    return {
        'query_format': 'advanced',
        'field0-0-0': 'attachments.mimetype',
        'type0-0-0': 'equals',
        'value0-0-0': mimetype,
    }
def get_downloaded_files(prefix, suffix):
    """List the already-downloaded files of one tracker for one file type.

    Looks inside the directory named ``suffix`` (relative to the current
    working directory) for entries whose name starts with ``prefix`` and
    ends with ``.suffix``.

    :param prefix: bug-tracker prefix the file names start with.
    :param suffix: file extension, which is also the directory name.
    :return: list of matching paths, as produced by ``glob.glob``.
    """
    name_pattern = '%s*.%s' % (prefix, suffix)
    search_path = os.path.join(suffix, name_pattern)
    return glob.glob(search_path)
def get_file_bz_ids(files, prefix):
    """Extract the distinct bug ids from a list of downloaded file paths.

    For each path, the base name is cut at the first ``-`` and the first
    occurrence of ``prefix`` is removed from what remains; e.g. with prefix
    ``fdo`` the path ``odt/fdo12345-1.odt`` yields ``'12345'``.

    :param files: iterable of file paths.
    :param prefix: bug-tracker prefix to strip from each name.
    :return: set of bug-id strings (empty set for no files).
    """
    return {
        os.path.basename(path).split('-')[0].replace(prefix, '', 1)
        for path in files
    }
def get_changed_date(files):
    """Return the date one day before the newest modification time in *files*.

    :param files: non-empty iterable of paths to existing files.
    :return: ``datetime.date`` (local time) of the newest mtime minus 24 hours.
    :raises ValueError: if *files* is empty.
    :raises OSError: if one of the files cannot be stat'ed.
    """
    mtimes = [os.path.getmtime(path) for path in files]
    if not mtimes:
        raise ValueError("get_changed_date() needs at least one file")
    newest = max(mtimes)
    # Subtract a day to avoid timezone differences. The worst thing that
    # can happen is that we are going to process more bugs than necessary.
    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
169 def get_through_rpc_query(rpcurl
, showurl
, mimetype
, prefix
, suffix
):
175 def process(query
, full
, have
=[]):
177 proxy
= xmlrpclib
.ServerProxy(rpcurl
)
178 result
= proxy
.Bug
.search(query
)
179 bugs
= result
['bugs']
180 print(str(len(bugs
)) + ' bugs to process')
183 available
= set([str(bug
['id']) for bug
in bugs
])
184 # we already have files from all available bugs
185 if available
.difference(set(have
)) == set():
186 print("assuming all downloaded files are up to date")
190 url
= showurl
+ str(bug
['id'])
191 get_from_bug_url_via_xml(url
, mimetype
, prefix
, suffix
)
192 except xmlrpclib
.Fault
as err
:
193 print("A fault occurred")
194 print("Fault code: %s" % err
.faultCode
)
195 print(err
.faultString
)
197 query
= create_query(mimetype
)
198 query
['column_list']='bug_id'
200 files
= get_downloaded_files(prefix
, suffix
)
203 print('looking for updated bugs having %s attachment(s)' % mimetype
)
204 query_changed
= query
.copy()
205 query_changed
['field0-1-0'] = 'days_elapsed'
206 query_changed
['type0-1-0'] = 'lessthaneq'
207 query_changed
['value0-1-0'] = str((datetime
.date
.today() - get_changed_date(files
)).days
)
208 process(query_changed
, False)
210 print('looking for all bugs having %s attachment(s)' % mimetype
)
211 process(query
, True, get_file_bz_ids(files
, prefix
))
213 def get_through_rss_query(queryurl
, mimetype
, prefix
, suffix
):
219 #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
220 #get_novell_bug_via_xml function is a workaround for that situation
221 get_bug_function
= get_novell_bug_via_xml
if prefix
== "novell" else get_from_bug_url_via_xml
223 def process(query
, full
, have
=[]):
224 url
= queryurl
+ '?' + '&'.join(['='.join(kv
) for kv
in query
.items()])
225 print('url is ' + url
)
226 d
= feedparser
.parse(url
)
227 print(str(len(d
['entries'])) + ' bugs to process')
230 for entry
in d
['entries']:
231 bugid
= entry
['id'].split('=')[-1]
232 entries
.append(entry
)
235 available
= set([str(entry
['id'].split('=')[-1]) for entry
in entries
])
236 # we already have files from all available bugs
237 if available
.difference(set(have
)) == set():
238 print("assuming all downloaded files are up to date")
241 for entry
in entries
:
243 get_bug_function(entry
['id'], mimetype
, prefix
, suffix
)
244 except KeyboardInterrupt:
245 raise # Ctrl+C should work
247 print(entry
['id'] + " failed: " + str(sys
.exc_info()[0]))
250 query
= create_query(escape(mimetype
.replace("+","%2B")))
251 query
['ctype'] = 'rss'
253 files
= get_downloaded_files(prefix
, suffix
)
256 print('looking for updated bugs having %s attachment(s)' % mimetype
)
257 query_changed
= query
.copy()
258 query_changed
['field0-1-0'] = 'changed'
259 query_changed
['type0-1-0'] = 'changedbefore'
260 query_changed
['value0-1-0'] = get_changed_date(files
).isoformat()
261 process(query_changed
, False)
263 print('looking for all bugs having %s attachment(s)' % mimetype
)
264 process(query
, True, get_file_bz_ids(files
, prefix
))
# Searching for bugs by attachment mimetype is not available in the Launchpad
# API, so we iterate over all bugs of the most interesting source packages.
290 "python-uniconvertor",
296 def get_launchpad_bugs(prefix
):
297 #launchpadlib python module is required to download launchpad attachments
298 from launchpadlib
.launchpad
import Launchpad
300 launchpad
= Launchpad
.login_anonymously("attachmentdownload", "production")
301 ubuntu
= launchpad
.distributions
["ubuntu"]
303 for pkg
in launchpad_pkgs
:
304 srcpkg
= ubuntu
.getSourcePackage(name
=pkg
)
305 pkgbugs
= srcpkg
.searchTasks(status
=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])
307 for bugtask
in pkgbugs
:
310 print("parsing " + id + " status: " + bugtask
.status
+ " title: " + bug
.title
[:50])
312 for attachment
in bug
.attachments
:
314 handle
= attachment
.data
.open()
315 if not handle
.content_type
in mimetypes
:
319 suffix
= mimetypes
[handle
.content_type
]
320 if not os
.path
.isdir(suffix
):
326 download
= suffix
+ '/' + prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
328 if os
.path
.isfile(download
):
329 print("assuming " + id + " is up to date")
332 print('mimetype is ' + handle
.content_type
+ ' downloading as ' + download
)
334 tmpfile
= download
+ ".tmp"
335 f
= open(tmpfile
, "wb")
336 f
.write(handle
.read())
338 os
.rename(tmpfile
, download
)
341 ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
342 ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
343 ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
344 ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
345 ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
346 ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
347 ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
348 # It seems something has changed and it is no longer possible to
349 # download any files from there.
350 # NOTE: I am leaving it in the list, commented out, just so someone
351 # does not add it back immediately .-)
352 # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
353 ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
354 ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
357 redhatrpc
= 'https://bugzilla.redhat.com/xmlrpc.cgi'
358 redhatbug
= 'https://bugzilla.redhat.com/show_bug.cgi?id='
# Novell Bugzilla requires users to log in to get bug details such as
# attachment bodies. As a dirty workaround, we parse comments containing
# "Created an attachment (id=xxxxxx)" and download the attachments manually.
# python-bugzilla claims to support Novell Bugzilla login, but it is not
# working right now, and the Novell Bugzilla login system is a nightmare.
364 novellattach
= 'https://bugzilla.novell.com/attachment.cgi?id='
368 'application/vnd.oasis.opendocument.base': 'odb',
369 'application/vnd.oasis.opendocument.database': 'odb',
370 'application/vnd.oasis.opendocument.chart': 'odc',
371 'application/vnd.oasis.opendocument.chart-template': 'otc',
372 'application/vnd.oasis.opendocument.formula': 'odf',
373 'application/vnd.oasis.opendocument.formula-template': 'otf',
374 'application/vnd.oasis.opendocument.graphics': 'odg',
375 'application/vnd.oasis.opendocument.graphics-template': 'otg',
376 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
377 'application/vnd.oasis.opendocument.presentation': 'odp',
378 'application/vnd.oasis.opendocument.presentation-template': 'otp',
379 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
380 'application/vnd.oasis.opendocument.spreadsheet': 'ods',
381 'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
382 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
383 'application/vnd.oasis.opendocument.text': 'odt',
384 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
385 'application/vnd.oasis.opendocument.text-master': 'odm',
386 'application/vnd.oasis.opendocument.text-template': 'ott',
387 'application/vnd.oasis.opendocument.text-master-template': 'otm',
388 'application/vnd.oasis.opendocument.text-web': 'oth',
390 'application/vnd.sun.xml.base': 'odb',
391 'application/vnd.sun.xml.calc': 'sxc',
392 'application/vnd.sun.xml.calc.template': 'stc',
393 'application/vnd.sun.xml.chart': 'sxs',
394 'application/vnd.sun.xml.draw': 'sxd',
395 'application/vnd.sun.xml.draw.template': 'std',
396 'application/vnd.sun.xml.impress': 'sxi',
397 'application/vnd.sun.xml.impress.template': 'sti',
398 'application/vnd.sun.xml.math': 'sxm',
399 'application/vnd.sun.xml.writer': 'sxw',
400 'application/vnd.sun.xml.writer.global': 'sxg',
401 'application/vnd.sun.xml.writer.template': 'stw',
402 'application/vnd.sun.xml.writer.web': 'stw',
404 'application/rtf': 'rtf',
406 'application/msword': 'doc',
407 'application/vnd.ms-powerpoint': 'ppt',
408 'application/vnd.ms-excel': 'xls',
409 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
410 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
411 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
412 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
413 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
414 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
415 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
416 'application/vnd.ms-word.document.macroEnabled.12': 'docm',
417 'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
418 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
419 'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
420 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
421 'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
422 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
423 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
424 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
425 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
426 'application/vnd.visio': 'vsd',
427 'application/visio.drawing': 'vsd',
428 'application/vnd.visio2013': 'vsdx',
429 'application/vnd.visio.xml': 'vdx',
430 'application/x-mspublisher': 'pub',
432 'application/wps-office.doc': 'doc',
433 'application/wps-office.docx': 'docx',
434 'application/wps-office.xls': 'xls',
435 'application/wps-office.xlsx': 'xlsx',
436 'application/wps-office.ppt': 'ppt',
437 'application/wps-office.pptx': 'pptx',
439 'application/xhtml+xml': 'xhtml',
440 'application/mathml+xml': 'mml',
442 'application/docbook+xml': 'docbook',
445 'text/spreadsheet': 'slk',
446 'application/x-qpro': 'qpro',
447 'application/x-dbase': 'dbf',
448 'application/vnd.corel-draw': 'cdr',
449 'application/vnd.lotus-wordpro': 'lwp',
450 'application/vnd.lotus-1-2-3': 'wks',
451 'application/vnd.wordperfect': 'wpd',
452 'application/wordperfect5.1': 'wpd',
453 'application/vnd.ms-works': 'wps',
454 'application/clarisworks' : 'cwk',
455 'application/macwriteii' : 'mw',
456 'application/vnd.apple.keynote': 'key',
457 'application/vnd.apple.numbers': 'numbers',
458 'application/vnd.apple.pages': 'pages',
459 'application/x-iwork-keynote-sffkey': 'key',
460 'application/x-iwork-numbers-sffnumbers': 'numbers',
461 'application/x-iwork-pages-sffpages': 'pages',
462 'application/x-hwp': 'hwp',
463 'application/x-aportisdoc': 'pdb',
464 'application/prs.plucker' : 'pdb_plucker',
465 'application/vnd.palm' : 'pdb_palm',
466 'application/x-sony-bbeb' : 'lrf',
467 'application/x-pocket-word': 'psw',
468 'application/x-t602': '602',
469 'application/x-fictionbook+xml': 'fb2',
470 'application/x-abiword': 'abw',
471 'application/x-pagemaker': 'pmd',
472 'application/x-gnumeric': 'gnumeric',
473 'application/vnd.stardivision.calc': 'sdc',
474 'application/vnd.stardivision.draw': 'sda',
475 'application/vnd.stardivision.writer': 'sdw',
476 'application/x-starcalc': 'sdc',
477 'application/x-stardraw': 'sdd',
478 'application/x-starwriter': 'sdw',
479 # relatively uncommon image mimetypes
480 'image/x-freehand': 'fh',
483 'image/tiff': 'tiff',
484 'image/vnd.dxf': 'dxf',
486 'image/x-emf': 'emf',
487 'image/x-targa': 'tga',
488 'image/x-sgf': 'sgf',
489 'image/x-svm': 'svm',
491 'image/x-wmf': 'wmf',
492 'image/x-pict': 'pict',
493 'image/x-cmx': 'cmx',
494 'image/svg+xml': 'svg',
496 'image/x-ms-bmp': 'bmp',
497 'image/x-MS-bmp': 'bmp',
498 'image/x-wpg': 'wpg',
499 'image/x-eps': 'eps',
500 'image/x-met': 'met',
501 'image/x-portable-bitmap': 'pbm',
502 'image/x-photo-cd': 'pcd',
503 'image/x-pcx': 'pcx',
504 'image/x-portable-graymap': 'pgm',
505 'image/x-portable-pixmap': 'ppm',
506 'image/vnd.adobe.photoshop': 'psd',
507 'image/x-cmu-raster': 'ras',
508 'image/x-sun-raster': 'ras',
509 'image/x-xbitmap': 'xbm',
510 'image/x-xpixmap': 'xpm',
513 # disabled for now, this would download gigs of pngs/jpegs...
514 common_noncore_mimetypes
= {
517 'image/jpeg': 'jpeg',
520 'application/pdf': 'pdf',
523 class manage_threads(threading
.Thread
):
525 #print(threading.current_thread().get_ident())
527 # Try to receive a job from queue
530 # Use job parameters to call our query
531 # Then let the queue know we are done with this job
532 (uri
, mimetype
, prefix
, extension
) = jobs
.get(True,6)
534 get_through_rss_query(uri
, mimetype
, prefix
, extension
)
537 except KeyboardInterrupt:
538 raise # Ctrl+C should work
542 def generate_multi_threading():
543 for (prefix
, uri
) in rss_bugzillas
:
546 for i
in range(max_threads
):
547 manage_threads().start()
549 # Create a job for every mimetype for a bugzilla
550 for (mimetype
,extension
) in mimetypes
.items():
551 # It seems that bugzilla has problems returning that many results
552 # (10000 results is probably a limit set somewhere) so we always
553 # end processing the complete list.
554 if mimetype
== 'text/html' and prefix
== 'moz':
557 jobs
.put([uri
, mimetype
, prefix
, extension
], block
=True)
558 print("successfully placed a job in the queue searching for " + mimetype
+ " in bugtracker " + prefix
)
560 # Continue when all mimetypes are done for a bugzilla
562 print("DONE with bugtracker " + prefix
)
564 max_threads
= 20 # Number of threads to create, (1 = without multi-threading)
567 generate_multi_threading()
569 for (mimetype
,extension
) in mimetypes
.items():
570 get_through_rpc_query(redhatrpc
, redhatbug
, mimetype
, "rhbz", extension
)
573 get_launchpad_bugs("lp")
575 print("launchpadlib unavailable, skipping Ubuntu tracker")
577 # vim:set shiftwidth=4 softtabstop=4 expandtab: