2 # -*- coding: utf-8 -*-
4 # This file is part of the LibreOffice project.
6 # This Source Code Form is subject to the terms of the Mozilla Public
7 # License, v. 2.0. If a copy of the MPL was not distributed with this
8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
# This digs through a pile of Bugzilla instances and populates the current
# working directory with a big collection of bug documents. They are stored in
# per-filetype directories and named after their bug id, with a prefix that
# indicates which bug tracker they came from, e.g.
19 # where X is the n'th attachment of that type in the bug
21 # The results are stored in the current directory, categorized by the
22 # extension of the downloaded file. When a file already exists, it is assumed
23 # it is already downloaded by a previous run, and up-to-date.
25 from __future__
import print_function
40 from urllib
.request
import urlopen
42 from urllib
import urlopen
44 import xmlrpc
.client
as xmlrpclib
47 from xml
.dom
import minidom
48 from xml
.sax
.saxutils
import escape
50 def urlopen_retry(url
):
52 for i
in range(maxretries
+ 1):
56 print("caught IOError: " + str(e
))
61 def get_from_bug_url_via_xml(url
, mimetype
, prefix
, suffix
):
62 id = url
.rsplit('=', 2)[1]
63 print("id is " + prefix
+ id + " " + suffix
)
64 print("parsing " + id)
65 sock
= urlopen_retry(url
+"&ctype=xml")
66 dom
= minidom
.parse(sock
)
69 for attachment
in dom
.getElementsByTagName('attachment'):
71 print(" mimetype is", end
=' ')
72 for node
in attachment
.childNodes
:
73 if node
.nodeName
== 'type':
74 # check if attachment is deleted
75 if not node
.firstChild
:
76 print('deleted attachment, skipping')
79 print(node
.firstChild
.nodeValue
, end
=' ')
80 if node
.firstChild
.nodeValue
.lower() != mimetype
.lower():
83 elif node
.nodeName
== 'data':
84 # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
85 if not node
.firstChild
:
86 print('deleted attachment, skipping')
89 download
= suffix
+ '/' +prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
90 if os
.path
.isfile(download
):
91 print("assuming " + download
+ " is up to date")
94 # prevent re-downloading FDO attachments from TDF
95 if prefix
== "tdf" and int(id) < 88776:
96 fdodownload
= download
.replace("tdf", "fdo")
97 if os
.path
.isfile(fdodownload
):
98 print("assuming FDO " + fdodownload
+ " is up to date")
101 print('downloading as ' + download
)
102 tmpfile
= download
+ ".tmp"
103 f
= open(tmpfile
, 'wb')
104 f
.write(base64
.b64decode(node
.firstChild
.nodeValue
))
106 os
.rename(tmpfile
, download
)
109 def get_novell_bug_via_xml(url
, mimetype
, prefix
, suffix
):
110 id = url
.rsplit('=', 2)[1]
111 print("id is " + prefix
+ id + " " + suffix
)
112 print("parsing " + id)
113 sock
= urlopen_retry(url
+"&ctype=xml")
114 dom
= minidom
.parse(sock
)
117 for comment
in dom
.getElementsByTagName('thetext'):
118 commentText
= comment
.firstChild
.nodeValue
119 match
= re
.search(r
".*Created an attachment \(id=([0-9]+)\)", commentText
)
125 download
= suffix
+ '/' + prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
126 if os
.path
.isfile(download
):
127 print("assuming " + download
+ " is up to date")
130 realAttachmentId
= match
.group(1)
131 handle
= urlopen_retry(novellattach
+ realAttachmentId
)
133 print("attachment %s is not accessible" % realAttachmentId
)
135 print(" mimetype is", end
=' ')
138 if info
.get_content_type
:
139 remoteMime
= info
.get_content_type()
141 remoteMime
= info
.gettype()
142 print(remoteMime
, end
=' ')
143 if remoteMime
!= mimetype
:
147 print('downloading as ' + download
)
148 tmpfile
= download
+ ".tmp"
149 f
= open(tmpfile
, 'wb')
150 f
.write(handle
.read())
152 os
.rename(tmpfile
, download
)
154 def create_query(mimetype
):
156 query
['query_format']='advanced'
157 query
['field0-0-0']='attachments.mimetype'
158 query
['type0-0-0']='equals'
159 query
['value0-0-0']=mimetype
def get_downloaded_files(prefix, suffix):
    """Return the paths of already downloaded attachments of one tracker.

    The lookup happens in the directory named after *suffix*, relative to the
    current working directory, and matches files named
    ``<prefix>*.<suffix>``.

    prefix -- tracker prefix the file names start with
    suffix -- file extension, which is also the directory name

    Returns the list produced by glob.glob (empty if nothing matches).
    """
    pattern = '%s*.%s' % (prefix, suffix)
    return glob.glob(os.path.join(suffix, pattern))
def get_file_bz_ids(files, prefix):
    """Return the set of bug ids that the given downloaded files belong to.

    The bug id is recovered from each file's basename by taking the part
    before the first '-' and dropping the leading tracker prefix.

    files  -- iterable of file paths
    prefix -- tracker prefix the file names start with

    Returns a set of bug id strings (empty if *files* is empty).
    """
    def bug_id(path):
        stem = os.path.basename(path).split('-')[0]
        # Strip the prefix only where it leads the name; str.replace would
        # also remove an occurrence found further inside the stem.
        if stem.startswith(prefix):
            return stem[len(prefix):]
        return stem

    return {bug_id(path) for path in files}
def get_changed_date(files):
    """Return the date of the newest file in *files*, moved back by one day.

    files -- non-empty iterable of paths to existing files

    Returns a datetime.date built from the largest modification time
    (in whole seconds) found among the files, minus 24 hours.
    """
    one_day = 24 * 60 * 60
    latest_mtime = max(os.stat(path)[stat.ST_MTIME] for path in files)
    # Going back a day guards against timezone differences. The worst thing
    # that can happen is that more bugs than necessary get processed.
    return datetime.date.fromtimestamp(latest_mtime - one_day)
174 def get_through_rpc_query(rpcurl
, showurl
, mimetype
, prefix
, suffix
):
180 def process(query
, full
, have
=[]):
182 proxy
= xmlrpclib
.ServerProxy(rpcurl
)
183 result
= proxy
.Bug
.search(query
)
184 bugs
= result
['bugs']
185 print(str(len(bugs
)) + ' bugs to process')
188 available
= set([str(bug
['id']) for bug
in bugs
])
189 # we already have files from all available bugs
190 if available
.difference(set(have
)) == set():
191 print("assuming all downloaded files are up to date")
195 url
= showurl
+ str(bug
['id'])
196 get_from_bug_url_via_xml(url
, mimetype
, prefix
, suffix
)
197 except xmlrpclib
.Fault
as err
:
198 print("A fault occurred")
199 print("Fault code: %s" % err
.faultCode
)
200 print(err
.faultString
)
202 query
= create_query(mimetype
)
203 query
['column_list']='bug_id'
205 files
= get_downloaded_files(prefix
, suffix
)
208 print('looking for updated bugs having %s attachment(s)' % mimetype
)
209 query_changed
= query
.copy()
210 query_changed
['field0-1-0'] = 'days_elapsed'
211 query_changed
['type0-1-0'] = 'lessthaneq'
212 query_changed
['value0-1-0'] = str((datetime
.date
.today() - get_changed_date(files
)).days
)
213 process(query_changed
, False)
215 print('looking for all bugs having %s attachment(s)' % mimetype
)
216 process(query
, True, get_file_bz_ids(files
, prefix
))
218 def get_through_rss_query(queryurl
, mimetype
, prefix
, suffix
):
224 #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
225 #get_novell_bug_via_xml function is a workaround for that situation
226 get_bug_function
= get_novell_bug_via_xml
if prefix
== "novell" else get_from_bug_url_via_xml
228 def process(query
, full
, have
=[]):
229 url
= queryurl
+ '?' + '&'.join(['='.join(kv
) for kv
in query
.items()])
230 print('url is ' + url
)
231 d
= feedparser
.parse(url
)
232 print(str(len(d
['entries'])) + ' bugs to process')
235 for entry
in d
['entries']:
236 bugid
= entry
['id'].split('=')[-1]
237 entries
.append(entry
)
240 available
= set([str(entry
['id'].split('=')[-1]) for entry
in entries
])
241 # we already have files from all available bugs
242 if available
.difference(set(have
)) == set():
243 print("assuming all downloaded files are up to date")
246 for entry
in entries
:
248 get_bug_function(entry
['id'], mimetype
, prefix
, suffix
)
249 except KeyboardInterrupt:
250 raise # Ctrl+C should work
252 print(entry
['id'] + " failed: " + str(sys
.exc_info()[0]))
255 query
= create_query(escape(mimetype
.replace("+","%2B")))
256 query
['ctype'] = 'rss'
258 files
= get_downloaded_files(prefix
, suffix
)
261 print('looking for updated bugs having %s attachment(s)' % mimetype
)
262 query_changed
= query
.copy()
263 query_changed
['field0-1-0'] = 'delta_ts'
264 query_changed
['type0-1-0'] = 'greaterthaneq'
265 query_changed
['value0-1-0'] = get_changed_date(files
).isoformat()
266 process(query_changed
, False)
268 print('looking for all bugs having %s attachment(s)' % mimetype
)
269 process(query
, True, get_file_bz_ids(files
, prefix
))
# Searching for bugs that have attachments of a specific mimetype is not
# available in the Launchpad API, so we iterate over all bugs of the most
# interesting source packages instead.
295 "python-uniconvertor",
301 def get_launchpad_bugs(prefix
):
302 #launchpadlib python module is required to download launchpad attachments
303 from launchpadlib
.launchpad
import Launchpad
305 launchpad
= Launchpad
.login_anonymously("attachmentdownload", "production")
306 ubuntu
= launchpad
.distributions
["ubuntu"]
308 for pkg
in launchpad_pkgs
:
309 srcpkg
= ubuntu
.getSourcePackage(name
=pkg
)
310 pkgbugs
= srcpkg
.searchTasks(status
=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])
312 for bugtask
in pkgbugs
:
315 print("parsing " + id + " status: " + bugtask
.status
+ " title: " + bug
.title
[:50])
317 for attachment
in bug
.attachments
:
319 handle
= attachment
.data
.open()
320 if not handle
.content_type
in mimetypes
:
324 suffix
= mimetypes
[handle
.content_type
]
325 if not os
.path
.isdir(suffix
):
331 download
= suffix
+ '/' + prefix
+ id + '-' + str(attachmentid
) + '.' + suffix
333 if os
.path
.isfile(download
):
334 print("assuming " + id + " is up to date")
337 print('mimetype is ' + handle
.content_type
+ ' downloading as ' + download
)
339 tmpfile
= download
+ ".tmp"
340 f
= open(tmpfile
, "wb")
341 f
.write(handle
.read())
343 os
.rename(tmpfile
, download
)
346 ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
347 ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
348 ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
349 ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
350 ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
351 ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
352 ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
353 # It seems something has changed and it is no longer possible to
354 # download any files from there.
355 # NOTE: I am leaving it in the list, commented out, just so someone
356 # does not add it back immediately .-)
357 # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
358 # note: running this script against bz.apache.org apparently causes one's IP
359 # to be blacklisted or something; you won't get new files in any case...
360 # ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
361 ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
364 redhatrpc
= 'https://bugzilla.redhat.com/xmlrpc.cgi'
365 redhatbug
= 'https://bugzilla.redhat.com/show_bug.cgi?id='
# Novell Bugzilla requires users to log in to get bug details such as
# attachment bodies. As a dirty workaround, we parse comments containing
# "Created an attachment (id=xxxxxx)" and download the attachments manually.
# python-bugzilla claims to support Novell Bugzilla login, but it is not
# working right now, and the Novell Bugzilla login system is a nightmare.
371 novellattach
= 'https://bugzilla.novell.com/attachment.cgi?id='
375 'application/vnd.oasis.opendocument.base': 'odb',
376 'application/vnd.oasis.opendocument.database': 'odb',
377 'application/vnd.oasis.opendocument.chart': 'odc',
378 'application/vnd.oasis.opendocument.chart-template': 'otc',
379 'application/vnd.oasis.opendocument.formula': 'odf',
380 'application/vnd.oasis.opendocument.formula-template': 'otf',
381 'application/vnd.oasis.opendocument.graphics': 'odg',
382 'application/vnd.oasis.opendocument.graphics-template': 'otg',
383 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
384 'application/vnd.oasis.opendocument.presentation': 'odp',
385 'application/vnd.oasis.opendocument.presentation-template': 'otp',
386 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
387 'application/vnd.oasis.opendocument.spreadsheet': 'ods',
388 'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
389 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
390 'application/vnd.oasis.opendocument.text': 'odt',
391 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
392 'application/vnd.oasis.opendocument.text-master': 'odm',
393 'application/vnd.oasis.opendocument.text-template': 'ott',
394 'application/vnd.oasis.opendocument.text-master-template': 'otm',
395 'application/vnd.oasis.opendocument.text-web': 'oth',
397 'application/vnd.sun.xml.base': 'odb',
398 'application/vnd.sun.xml.calc': 'sxc',
399 'application/vnd.sun.xml.calc.template': 'stc',
400 'application/vnd.sun.xml.chart': 'sxs',
401 'application/vnd.sun.xml.draw': 'sxd',
402 'application/vnd.sun.xml.draw.template': 'std',
403 'application/vnd.sun.xml.impress': 'sxi',
404 'application/vnd.sun.xml.impress.template': 'sti',
405 'application/vnd.sun.xml.math': 'sxm',
406 'application/vnd.sun.xml.writer': 'sxw',
407 'application/vnd.sun.xml.writer.global': 'sxg',
408 'application/vnd.sun.xml.writer.template': 'stw',
409 'application/vnd.sun.xml.writer.web': 'stw',
411 'application/rtf': 'rtf',
413 'application/msword': 'doc',
414 'application/vnd.ms-powerpoint': 'ppt',
415 'application/vnd.ms-excel': 'xls',
416 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
417 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
418 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
419 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
420 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
421 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
422 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
423 'application/vnd.ms-word.document.macroEnabled.12': 'docm',
424 'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
425 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
426 'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
427 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
428 'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
429 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
430 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
431 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
432 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
433 'application/vnd.visio': 'vsd',
434 'application/visio.drawing': 'vsd',
435 'application/vnd.visio2013': 'vsdx',
436 'application/vnd.visio.xml': 'vdx',
437 'application/x-mspublisher': 'pub',
439 'application/wps-office.doc': 'doc',
440 'application/wps-office.docx': 'docx',
441 'application/wps-office.xls': 'xls',
442 'application/wps-office.xlsx': 'xlsx',
443 'application/wps-office.ppt': 'ppt',
444 'application/wps-office.pptx': 'pptx',
446 'application/xhtml+xml': 'xhtml',
447 'application/mathml+xml': 'mml',
449 'application/docbook+xml': 'docbook',
452 'text/spreadsheet': 'slk',
453 'application/x-qpro': 'qpro',
454 'application/x-dbase': 'dbf',
455 'application/vnd.corel-draw': 'cdr',
456 'application/vnd.lotus-wordpro': 'lwp',
457 'application/vnd.lotus-1-2-3': 'wks',
458 'application/vnd.wordperfect': 'wpd',
459 'application/wordperfect5.1': 'wpd',
460 'application/vnd.ms-works': 'wps',
461 'application/clarisworks' : 'cwk',
462 'application/macwriteii' : 'mw',
463 'application/vnd.apple.keynote': 'key',
464 'application/vnd.apple.numbers': 'numbers',
465 'application/vnd.apple.pages': 'pages',
466 'application/x-iwork-keynote-sffkey': 'key',
467 'application/x-iwork-numbers-sffnumbers': 'numbers',
468 'application/x-iwork-pages-sffpages': 'pages',
469 'application/x-hwp': 'hwp',
470 'application/x-aportisdoc': 'pdb',
471 'application/prs.plucker' : 'pdb_plucker',
472 'application/vnd.palm' : 'pdb_palm',
473 'application/x-sony-bbeb' : 'lrf',
474 'application/x-pocket-word': 'psw',
475 'application/x-t602': '602',
476 'application/x-fictionbook+xml': 'fb2',
477 'application/x-abiword': 'abw',
478 'application/x-pagemaker': 'pmd',
479 'application/x-gnumeric': 'gnumeric',
480 'application/vnd.stardivision.calc': 'sdc',
481 'application/vnd.stardivision.draw': 'sda',
482 'application/vnd.stardivision.writer': 'sdw',
483 'application/x-starcalc': 'sdc',
484 'application/x-stardraw': 'sdd',
485 'application/x-starwriter': 'sdw',
486 # relatively uncommon image mimetypes
487 'image/x-freehand': 'fh',
490 'image/tiff': 'tiff',
491 'image/vnd.dxf': 'dxf',
493 'image/x-emf': 'emf',
494 'image/x-targa': 'tga',
495 'image/x-sgf': 'sgf',
496 'image/x-svm': 'svm',
498 'image/x-wmf': 'wmf',
499 'image/x-pict': 'pict',
500 'image/x-cmx': 'cmx',
501 'image/svg+xml': 'svg',
503 'image/x-ms-bmp': 'bmp',
504 'image/x-MS-bmp': 'bmp',
505 'image/x-wpg': 'wpg',
506 'image/x-eps': 'eps',
507 'image/x-met': 'met',
508 'image/x-portable-bitmap': 'pbm',
509 'image/x-photo-cd': 'pcd',
510 'image/x-pcx': 'pcx',
511 'image/x-portable-graymap': 'pgm',
512 'image/x-portable-pixmap': 'ppm',
513 'image/vnd.adobe.photoshop': 'psd',
514 'image/x-cmu-raster': 'ras',
515 'image/x-sun-raster': 'ras',
516 'image/x-xbitmap': 'xbm',
517 'image/x-xpixmap': 'xpm',
520 # disabled for now, this would download gigs of pngs/jpegs...
521 common_noncore_mimetypes
= {
524 'image/jpeg': 'jpeg',
527 'application/pdf': 'pdf',
530 class manage_threads(threading
.Thread
):
532 #print(threading.current_thread().get_ident())
534 # Try to receive a job from queue
537 # Use job parameters to call our query
538 # Then let the queue know we are done with this job
539 (uri
, mimetype
, prefix
, extension
) = jobs
.get(True,6)
541 get_through_rss_query(uri
, mimetype
, prefix
, extension
)
544 except KeyboardInterrupt:
545 raise # Ctrl+C should work
549 def generate_multi_threading():
550 for (prefix
, uri
) in rss_bugzillas
:
553 for i
in range(max_threads
):
554 manage_threads().start()
556 # Create a job for every mimetype for a bugzilla
557 for (mimetype
,extension
) in mimetypes
.items():
558 # It seems that bugzilla has problems returning that many results
559 # (10000 results is probably a limit set somewhere) so we always
560 # end processing the complete list.
561 if mimetype
== 'text/html' and prefix
== 'moz':
564 jobs
.put([uri
, mimetype
, prefix
, extension
], block
=True)
565 print("successfully placed a job in the queue searching for " + mimetype
+ " in bugtracker " + prefix
)
567 # Continue when all mimetypes are done for a bugzilla
569 print("DONE with bugtracker " + prefix
)
571 max_threads
= 20 # Number of threads to create, (1 = without multi-threading)
574 generate_multi_threading()
576 for (mimetype
,extension
) in mimetypes
.items():
577 get_through_rpc_query(redhatrpc
, redhatbug
, mimetype
, "rhbz", extension
)
580 get_launchpad_bugs("lp")
582 print("launchpadlib unavailable, skipping Ubuntu tracker")
584 # vim:set shiftwidth=4 softtabstop=4 expandtab: