# -*- coding: utf-8 -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# This digs through a pile of bugzillas and populates the cwd with a big
# collection of bug-docs in per-filetype dirs, with bug-ids as names with
# prefixes to indicate which bug-tracker, e.g.
#
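#   fdo-bugid-X.suffix
#   rhbz-bugid-X.suffix
#   moz-bugid-X.suffix
#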
# where X is the n'th attachment of that type in the bug.
#
# The results are stored in the current directory, categorized by the
# extension of the downloaded file. When a file already exists, it is assumed
# to have been downloaded by a previous run and to be up to date.

from __future__ import print_function

import base64
import datetime
import glob
import os
import os.path
import re
import stat
import sys
import threading

import feedparser

try:
    import queue
except ImportError:
    import Queue as queue
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
try:
    import xmlrpc.client as xmlrpclib
except ImportError:
    import xmlrpclib
from xml.dom import minidom
from xml.sax.saxutils import escape

from attachment_mimetypes import mimetypes
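# attachment_mimetypes is expected to map a MIME type to the file extension
# used as both the download directory and the file suffix, e.g. something
# like 'application/vnd.oasis.opendocument.text' -> 'odt'.
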
def urlopen_retry(url):
    maxretries = 3
    for i in range(maxretries + 1):
        try:
            return urlopen(url)
        except IOError as e:
            print("caught IOError: " + str(e))
            if maxretries == i:
                raise
            print("retrying...")

def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(" mimetype is", end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # check if the attachment is deleted
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if the attachment is deleted (e.g. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                # prevent re-downloading FDO attachments from TDF
                if prefix == "tdf" and int(id) < 88776:
                    fdodownload = download.replace("tdf", "fdo")
                    if os.path.isfile(fdodownload):
                        print("assuming FDO " + fdodownload + " is up to date")
                        continue

                print('downloading as ' + download)
                tmpfile = download + ".tmp"
                f = open(tmpfile, 'wb')
                f.write(base64.b64decode(node.firstChild.nodeValue))
                f.close()
                os.rename(tmpfile, download)

def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        commentText = comment.firstChild.nodeValue
        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
        if not match:
            continue

        attachmentid += 1

        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
        if os.path.isfile(download):
            print("assuming " + download + " is up to date")
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print("attachment %s is not accessible" % realAttachmentId)
            continue
        print(" mimetype is", end=' ')

        info = handle.info()
        if hasattr(info, 'get_content_type'):
            remoteMime = info.get_content_type()  # Python 3 email.message.Message
        else:
            remoteMime = info.gettype()  # Python 2 mimetools.Message
        print(remoteMime, end=' ')
        if remoteMime != mimetype:
            print('skipping')
            continue

        print('downloading as ' + download)
        tmpfile = download + ".tmp"
        f = open(tmpfile, 'wb')
        f.write(handle.read())
        f.close()
        os.rename(tmpfile, download)

def create_query(mimetype):
    query = dict()
    query['query_format'] = 'advanced'
    query['field0-0-0'] = 'attachments.mimetype'
    query['type0-0-0'] = 'equals'
    query['value0-0-0'] = mimetype
    return query

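# e.g. create_query('application/vnd.oasis.opendocument.text') builds the
# advanced-search terms "attachments.mimetype equals <mimetype>"; the RSS and
# XML-RPC query paths below each extend the returned dict with their own
# parameters before submitting it.
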
def get_downloaded_files(prefix, suffix):
    return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))

def get_file_bz_ids(files, prefix):
    return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])

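# These two helpers map between files on disk and bug ids: e.g.
# get_downloaded_files('fdo', 'odt') globs 'odt/fdo*.odt', and
# get_file_bz_ids(['odt/fdo12345-1.odt'], 'fdo') yields {'12345'}.
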
def get_changed_date(files):
    newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
    # Subtract a day to avoid timezone differences. The worst thing that
    # can happen is that we are going to process more bugs than necessary.
    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)

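# e.g. if the newest downloaded file was modified at 2015-06-02 10:00 local
# time, get_changed_date returns datetime.date(2015, 6, 1).
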
def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    def process(query, full, have=[]):
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set([str(bug['id']) for bug in bugs])
                # we already have files from all available bugs
                if available.difference(set(have)) == set():
                    print("assuming all downloaded files are up to date")
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            print("A fault occurred")
            print("Fault code: %s" % err.faultCode)
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

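# Both this XML-RPC path and the RSS path below run in two phases: first a
# cheap incremental query limited to bugs changed since the newest file on
# disk, then a full query whose per-bug downloads are skipped early when
# every listed bug already has a file locally.
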
def get_through_rss_query(queryurl, mimetype, prefix, suffix):
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    # Getting detailed bug information and downloading an attachment body is
    # not possible without logging in to Novell bugzilla;
    # get_novell_bug_via_xml is a workaround for that situation.
    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml

    def process(query, full, have=[]):
        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        entries = []
        for entry in d['entries']:
            entries.append(entry)

        if full:
            available = set([str(entry['id'].split('=')[-1]) for entry in entries])
            # we already have files from all available bugs
            if available.difference(set(have)) == set():
                print("assuming all downloaded files are up to date")
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except KeyboardInterrupt:
                raise  # Ctrl+C should work
            except Exception:
                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype.replace("+", "%2B")))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

# Searching for bugs having attachments with specific mimetypes is not
# available in the launchpad API, so we iterate over all bugs of the most
# interesting source packages.
launchpad_pkgs = (
    "python-uniconvertor",
)

def get_launchpad_bugs(prefix):
    # the launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
    ubuntu = launchpad.distributions["ubuntu"]

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid",
            "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete",
            "Incomplete (with response)", "Incomplete (without response)",
            "Fix Released", "Opinion", "Expired"])

        for bugtask in pkgbugs:
            bug = bugtask.bug
            id = str(bug.id)
            print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
            attachmentid = 0

            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    # not a mimetype we are interested in
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except OSError:
                        pass

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix

                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                print('mimetype is ' + handle.content_type + ' downloading as ' + download)

                tmpfile = download + ".tmp"
                f = open(tmpfile, "wb")
                f.write(handle.read())
                f.close()
                os.rename(tmpfile, download)

rss_bugzillas = (
    ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), # added for abiword
    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
    ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
    # It seems something has changed and it is no longer possible to
    # download any files from Novell bugzilla.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # ( 'novell', 'https://bugzilla.novell.com/buglist.cgi' ),
    # note: running this script against bz.apache.org apparently causes one's IP
    # to be banned or something; you won't get new files in any case...
    # ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
)

redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in in order to get details of the
# bugs, such as attachment bodies. As a dirty workaround, we parse comments
# containing "Created an attachment (id=xxxxxx)" and download the attachments
# manually. python-bugzilla claims to support Novell bugzilla logins, but it
# is not working right now, and the Novell bugzilla login system is a
# nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='

class manage_threads(threading.Thread):
    def run(self):
        while True:
            try:
                # Try to receive a job from the queue, use its parameters to
                # run our query, then let the queue know we are done with it.
                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
                try:
                    get_through_rss_query(uri, mimetype, prefix, extension)
                finally:
                    jobs.task_done()
            except KeyboardInterrupt:
                raise  # Ctrl+C should work
            except queue.Empty:
                break

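# jobs.get(True, 6) blocks for up to 6 seconds; once the queue stays empty
# that long, queue.Empty ends the worker thread. The task_done() call in the
# finally block is what lets jobs.join() in generate_multi_threading() return.
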
def generate_multi_threading():

    # Initialize threads
    for i in range(max_threads):
        manage_threads().start()

    for (prefix, uri) in rss_bugzillas:

        # Create a job for every mimetype for a bugzilla
        for (mimetype, extension) in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere), so we would
            # always end up processing the complete list.
            if mimetype == 'text/html' and prefix == 'moz':
                continue

            jobs.put([uri, mimetype, prefix, extension], block=True)
            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)

        # Continue when all mimetypes are done for a bugzilla
        print("STARTED all bugtracker " + prefix)

    jobs.join()

max_threads = 20  # Number of threads to create (1 = without multi-threading)
jobs = queue.Queue()

generate_multi_threading()

# Red Hat bugzilla is queried through its XML-RPC interface rather than RSS
for (mimetype, extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")

# vim:set shiftwidth=4 softtabstop=4 expandtab: