Fix bzr-fastimport when used with newer versions of python-fastimport. (Jelmer Vernooij)
[bzr-fastimport.git] / exporter.py
blobf2006c1f8875e1cf9a6e7e70130989a7ef97e522
1 # -*- coding: utf-8 -*-
3 # Copyright (C) 2008 Canonical Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # Based on bzr-fast-export
19 # Copyright (c) 2008 Adeodato Simó
21 # Permission is hereby granted, free of charge, to any person obtaining
22 # a copy of this software and associated documentation files (the
23 # "Software"), to deal in the Software without restriction, including
24 # without limitation the rights to use, copy, modify, merge, publish,
25 # distribute, sublicense, and/or sell copies of the Software, and to
26 # permit persons to whom the Software is furnished to do so, subject to
27 # the following conditions:
29 # The above copyright notice and this permission notice shall be included
30 # in all copies or substantial portions of the Software.
32 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
33 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
34 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
35 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
36 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
37 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
38 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
40 # vim: fileencoding=utf-8
42 """Core engine for the fast-export command."""
44 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
45 # is not updated (because the parent of commit is already merged, so we don't
46 # set new_git_branch to the previously used name)
48 from email.Utils import parseaddr
49 import sys, time, re
51 import bzrlib.branch
52 import bzrlib.revision
53 from bzrlib import (
54 builtins,
55 errors as bazErrors,
56 osutils,
57 progress,
58 trace,
61 from bzrlib.plugins.fastimport import (
62 helpers,
63 marks_file,
66 from fastimport import commands
67 from bzrlib.plugins.fastimport.helpers import (
68 binary_stream,
69 single_plural,
def _get_output_stream(destination):
    """Open the stream that the fast-export data is written to.

    :param destination: name of the output file. None or '-' selects
        sys.stdout (passed through binary_stream); a name ending in 'gz'
        is opened with gzip; anything else is opened as a plain file.
    :return: a file-like object opened for binary writing
    """
    use_stdout = destination is None or destination == '-'
    if use_stdout:
        return binary_stream(sys.stdout)
    if destination.endswith('gz'):
        # gzip is only imported when a compressed destination is requested.
        import gzip
        return gzip.open(destination, 'wb')
    return open(destination, 'wb')
82 # from dulwich.repo:
def check_ref_format(refname):
    """Check if a refname is correctly formatted.

    Implements all the same rules as git-check-ref-format[1].

    [1] http://www.kernel.org/pub/software/scm/git/docs/git-check-ref-format.html

    :param refname: The refname to check
    :return: True if refname is valid, False otherwise
    """
    # These could be combined into one big expression, but are listed
    # separately to parallel [1].
    if '/.' in refname or refname.startswith('.'):
        return False
    # An empty refname is rejected here too, so the refname[-1] lookup
    # further down can never see an empty string.
    if '/' not in refname:
        return False
    if '..' in refname:
        return False
    for c in refname:
        # 0x20 is the first non-control ASCII character. It is spelled in
        # hex rather than as the legacy octal literal 040 so that the
        # module parses on both Python 2 and Python 3.
        if ord(c) < 0x20 or c in '\177 ~^:?*[':
            return False
    if refname[-1] in '/.':
        return False
    if refname.endswith('.lock'):
        return False
    if '@{' in refname:
        return False
    if '\\' in refname:
        return False
    return True
def sanitize_ref_name_for_git(refname):
    """Rewrite refname so that it will be accepted by git-fast-import.

    Every character sequence that git-check-ref-format forbids is replaced
    by a single underscore.

    By rewriting the refname we are breaking uniqueness guarantees provided
    by bzr so we have to manually verify that resulting ref names are
    unique.

    :param refname: refname to rewrite
    :return: new refname
    """
    forbidden = re.compile(
        # '/.' in refname or startswith '.'
        r"/\.|^\."
        # '..' in refname
        r"|\.\."
        # ASCII control characters (ord(c) < 0x20)
        r"|[\000-\037]"
        # c in '\177 ~^:?*['
        r"|[\177 ~^:?*\[]"
        # last char in "/."
        r"|[/.]$"
        # endswith '.lock' -- the dot must be escaped, otherwise names that
        # merely end in "<any char>lock" (e.g. "block") get rewritten too
        r"|\.lock$"
        # "@{" in refname
        r"|@\{"
        # "\\" in refname
        r"|\\")
    return forbidden.sub("_", refname)
class BzrFastExporter(object):
    """Write the history of a bzr branch as a fast-import command stream.

    Typical use is to construct an instance and call run(), which emits one
    commit command per revision (plus optional feature, baseline, tag and
    checkpoint commands) to the output stream given to the constructor.
    """

    # Mark recorded in revid_to_mark for ghost revisions (revisions the
    # repository cannot supply). It must never be written as ":<mark>".
    _ghost_mark = -1

    def __init__(self, source, outf, ref=None, checkpoint=-1,
        import_marks_file=None, export_marks_file=None, revision=None,
        verbose=False, plain_format=False, rewrite_tags=False,
        no_tags=False, baseline=False):
        """Export branch data in fast import format.

        :param source: the branch to export
        :param outf: file-like object the commands are written to
        :param ref: the git ref passed to the emitted commit commands
        :param checkpoint: if greater than zero, save the marks and emit a
          checkpoint command every ``checkpoint`` commits
        :param import_marks_file: if set, marks are loaded from it so that
          revisions already marked are not emitted again
        :param export_marks_file: if set, the marks are saved to it
        :param revision: optional revision range restricting the export
        :param verbose: if True, report progress every 100 commits rather
          than every 1000 and note implicit renames
        :param plain_format: if True, 'classic' fast-import format is
          used without any extended features; if False, the generated
          data is richer and includes information like multiple
          authors, revision properties, etc.
        :param rewrite_tags: if True and if plain_format is set, tag names
          will be rewritten to be git-compatible.
          Otherwise tags which aren't valid for git will be skipped if
          plain_format is set.
        :param no_tags: if True tags won't be exported at all
        :param baseline: if True, the first revision of the export is
          emitted as a full tree rather than as a delta
        """
        self.branch = source
        self.outf = outf
        self.ref = ref
        self.checkpoint = checkpoint
        self.import_marks_file = import_marks_file
        self.export_marks_file = export_marks_file
        self.revision = revision
        self.excluded_revisions = set()
        self.plain_format = plain_format
        self.rewrite_tags = rewrite_tags
        self.no_tags = no_tags
        self.baseline = baseline
        # Older bzrlib versions only offer Revision.get_apparent_author().
        self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
            'get_apparent_authors')
        self.properties_to_exclude = ['authors', 'author']

        # Progress reporting stuff
        self.verbose = verbose
        if verbose:
            self.progress_every = 100
        else:
            self.progress_every = 1000
        self._start_time = time.time()
        self._commit_total = 0

        # Load the marks and initialise things accordingly
        self.revid_to_mark = {}
        self.branch_names = {}
        if self.import_marks_file:
            marks_info = marks_file.import_marks(self.import_marks_file)
            if marks_info is not None:
                # The marks file maps mark -> revision id; invert it.
                self.revid_to_mark = dict((r, m) for m, r in
                    marks_info.items())
                # These are no longer included in the marks file
                #self.branch_names = marks_info[1]

    def interesting_history(self):
        """Return the revision ids to export, oldest first.

        As a side effect, when a start revision was requested,
        self.excluded_revisions is set to the revisions reachable from it.
        """
        if self.revision:
            rev1, rev2 = builtins._get_revision_range(self.revision,
                self.branch, "fast-export")
            start_rev_id = rev1.rev_id
            end_rev_id = rev2.rev_id
        else:
            start_rev_id = None
            end_rev_id = None
        self.note("Calculating the revisions to include ...")
        view_revisions = [rev_id for rev_id, _, _, _ in
            self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
        view_revisions.reverse()
        # If a starting point was given, we need to later check that we don't
        # start emitting revisions from before that point. Collect the
        # revisions to exclude now ...
        if start_rev_id is not None:
            self.note("Calculating the revisions to exclude ...")
            self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
                self.branch.iter_merge_sorted_revisions(start_rev_id)])
            if self.baseline:
                # needed so the first relative commit knows its parent
                self.excluded_revisions.remove(start_rev_id)
                view_revisions.insert(0, start_rev_id)
        return list(view_revisions)

    def run(self):
        """Export the branch, then save the marks and report statistics."""
        # Export the data
        self.branch.repository.lock_read()
        try:
            interesting = self.interesting_history()
            self._commit_total = len(interesting)
            self.note("Starting export of %d revisions ..." %
                self._commit_total)
            if not self.plain_format:
                self.emit_features()
            if self.baseline:
                self.emit_baseline(interesting.pop(0), self.ref)
            for revid in interesting:
                self.emit_commit(revid, self.ref)
            if self.branch.supports_tags() and not self.no_tags:
                self.emit_tags()
        finally:
            self.branch.repository.unlock()

        # Save the marks if requested
        self._save_marks()
        self.dump_stats()

    def note(self, msg, *args):
        """Output a note but timestamp it."""
        msg = "%s %s" % (self._time_of_day(), msg)
        trace.note(msg, *args)

    def warning(self, msg, *args):
        """Output a warning but timestamp it."""
        msg = "%s WARNING: %s" % (self._time_of_day(), msg)
        trace.warning(msg, *args)

    def _time_of_day(self):
        """Time of day as a string."""
        # Note: this is a separate method so tests can patch in a fixed value
        return time.strftime("%H:%M:%S")

    def report_progress(self, commit_count, details=''):
        """Note the export rate every self.progress_every commits.

        :param commit_count: number of commits exported so far; nothing is
          reported when it is zero or not a multiple of progress_every
        :param details: extra text appended to the progress message
        """
        if commit_count and commit_count % self.progress_every == 0:
            if self._commit_total:
                counts = "%d/%d" % (commit_count, self._commit_total)
            else:
                counts = "%d" % (commit_count,)
            minutes = (time.time() - self._start_time) / 60
            if minutes > 0:
                rate = commit_count * 1.0 / minutes
                if rate > 10:
                    rate_str = "at %.0f/minute " % rate
                else:
                    rate_str = "at %.1f/minute " % rate
            else:
                # No measurable time has elapsed (coarse or adjusted
                # clock): leave the rate out rather than divide by zero.
                rate_str = ''
            self.note("%s commits exported %s%s" % (counts, rate_str, details))

    def dump_stats(self):
        """Note how many revisions are marked and how long the export took."""
        time_required = progress.str_tdelta(time.time() - self._start_time)
        rc = len(self.revid_to_mark)
        self.note("Exported %d %s in %s",
            rc, single_plural(rc, "revision", "revisions"),
            time_required)

    def print_cmd(self, cmd):
        """Write the repr() of cmd, followed by a newline, to the output."""
        # The 1-tuple keeps the formatting correct whatever type cmd is.
        self.outf.write("%r\n" % (cmd,))

    def _save_marks(self):
        """Write the mark -> revision id map to export_marks_file, if set."""
        if self.export_marks_file:
            revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
            marks_file.export_marks(self.export_marks_file, revision_ids)

    def is_empty_dir(self, tree, path):
        """Return True if path is a directory without children in tree.

        A path without a file id is reported with a warning and treated as
        not being an empty directory.
        """
        path_id = tree.path2id(path)
        if path_id is None:
            self.warning("Skipping empty_dir detection - no file_id for %s" %
                (path,))
            return False

        # Continue if path is not a directory
        if tree.kind(path_id) != 'directory':
            return False

        # Use treewalk to find the contents of our directory. Only the
        # first item yielded (presumably the entry for path itself) is
        # inspected, so stop there instead of walking the whole subtree.
        for contents in tree.walkdirs(prefix=path):
            return len(contents[1]) == 0
        return False

    def emit_features(self):
        """Emit a feature command for every known feature name, sorted."""
        for feature in sorted(commands.FEATURE_NAMES):
            self.print_cmd(commands.FeatureCommand(feature))

    def emit_baseline(self, revid, ref):
        """Emit revid as a commit holding its complete tree, with mark 1."""
        # Emit a full source tree of the first commit's parent
        revobj = self.branch.repository.get_revision(revid)
        mark = 1
        self.revid_to_mark[revid] = mark
        file_cmds = self._get_filecommands(bzrlib.revision.NULL_REVISION, revid)
        self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))

    def emit_commit(self, revid, ref):
        """Emit a commit command for revid unless it is marked or excluded.

        :param revid: id of the revision to export
        :param ref: git ref the commit is made on; replaced by a temporary
          ref for a parentless revision that is not the first one output
        """
        if revid in self.revid_to_mark or revid in self.excluded_revisions:
            return

        # Get the Revision object
        try:
            revobj = self.branch.repository.get_revision(revid)
        except bazErrors.NoSuchRevision:
            # This is a ghost revision. Mark it as not found and next!
            self.revid_to_mark[revid] = self._ghost_mark
            return

        # Get the primary parent
        # TODO: Consider the excluded revisions when deciding the parents.
        # Currently, a commit with parents that are excluded ought to be
        # triggering the ref calculation below (and it is not).
        # IGC 20090824
        ncommits = len(self.revid_to_mark)
        nparents = len(revobj.parent_ids)
        if nparents == 0:
            if ncommits:
                # This is a parentless commit but it's not the first one
                # output. We need to create a new temporary branch for it
                # otherwise git-fast-import will assume the previous commit
                # was this one's parent
                ref = self._next_tmp_ref()
            parent = bzrlib.revision.NULL_REVISION
        else:
            parent = revobj.parent_ids[0]

        # Print the commit
        mark = ncommits + 1
        self.revid_to_mark[revid] = mark
        file_cmds = self._get_filecommands(parent, revid)
        self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))

        # Report progress and checkpoint if it's time for that
        self.report_progress(ncommits)
        if (self.checkpoint > 0 and ncommits
            and ncommits % self.checkpoint == 0):
            self.note("Exported %i commits - adding checkpoint to output"
                % ncommits)
            self._save_marks()
            self.print_cmd(commands.CheckpointCommand())

    def _get_name_email(self, user):
        """Split a user string into UTF-8 encoded (name, email)."""
        if user.find('<') == -1:
            # If the email isn't inside <>, we need to use it as the name
            # in order for things to round-trip correctly.
            # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
            name = user
            email = ''
        else:
            name, email = parseaddr(user)
        return name.encode("utf-8"), email.encode("utf-8")

    def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
        """Build the CommitCommand for a revision.

        :param git_ref: the ref the commit is made on
        :param mark: the mark assigned to this commit
        :param revobj: the revision object being exported
        :param file_cmds: the file commands describing the tree changes
        :return: a commands.CommitCommand
        """
        # Get the committer and author info
        committer = revobj.committer
        name, email = self._get_name_email(committer)
        committer_info = (name, email, revobj.timestamp, revobj.timezone)
        if self._multi_author_api_available:
            # Slice rather than pop() so the list handed back by the
            # revision object is left untouched.
            all_authors = revobj.get_apparent_authors()
            author = all_authors[0]
            more_authors = all_authors[1:]
        else:
            more_authors = []
            author = revobj.get_apparent_author()
        if not self.plain_format and more_authors:
            name, email = self._get_name_email(author)
            author_info = (name, email, revobj.timestamp, revobj.timezone)
            more_author_info = []
            for a in more_authors:
                name, email = self._get_name_email(a)
                more_author_info.append(
                    (name, email, revobj.timestamp, revobj.timezone))
        elif author != committer:
            name, email = self._get_name_email(author)
            author_info = (name, email, revobj.timestamp, revobj.timezone)
            more_author_info = None
        else:
            author_info = None
            more_author_info = None

        # Get the parents in terms of marks
        non_ghost_parents = []
        for p in revobj.parent_ids:
            if p in self.excluded_revisions:
                continue
            try:
                parent_mark = self.revid_to_mark[p]
            except KeyError:
                # ghost - ignore
                continue
            if parent_mark == self._ghost_mark:
                # A ghost that emit_commit already recorded: it has no
                # commit in the stream, so it cannot be referenced.
                continue
            non_ghost_parents.append(":%s" % parent_mark)
        if non_ghost_parents:
            from_ = non_ghost_parents[0]
            merges = non_ghost_parents[1:]
        else:
            from_ = None
            merges = None

        # Filter the revision properties. Some metadata (like the
        # author information) is already exposed in other ways so
        # don't repeat it here.
        if self.plain_format:
            properties = None
        else:
            # Filter a copy: deleting from revobj.properties directly would
            # alter the revision object itself.
            properties = dict(revobj.properties)
            for prop in self.properties_to_exclude:
                properties.pop(prop, None)

        # Build and return the result
        return commands.CommitCommand(git_ref, mark, author_info,
            committer_info, revobj.message.encode("utf-8"), from_, merges,
            iter(file_cmds), more_authors=more_author_info,
            properties=properties)

    def _get_revision_trees(self, parent, revision_id):
        """Return (tree_old, tree_new) for a parent and a revision.

        tree_new is None when the revision's inventory is malformed. When
        the parent's inventory is malformed, the tree of the parent's own
        first parent is used as tree_old instead.
        """
        try:
            tree_old = self.branch.repository.revision_tree(parent)
        except bazErrors.UnexpectedInventoryFormat:
            self.warning("Parent is malformed - diffing against previous parent")
            # We can't find the old parent. Let's diff against his parent
            pp = self.branch.repository.get_revision(parent)
            tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
        tree_new = None
        try:
            tree_new = self.branch.repository.revision_tree(revision_id)
        except bazErrors.UnexpectedInventoryFormat:
            # We can't really do anything anymore
            self.warning("Revision %s is malformed - skipping" % revision_id)
        return tree_old, tree_new

    def _get_filecommands(self, parent, revision_id):
        """Get the list of FileCommands for the changes between two revisions."""
        tree_old, tree_new = self._get_revision_trees(parent, revision_id)
        if not(tree_old and tree_new):
            # Something is wrong with this revision - ignore the filecommands
            return []

        changes = tree_new.changes_from(tree_old)

        # Make "modified" have 3-tuples, as added does
        my_modified = [ x[0:3] for x in changes.modified ]

        # The potential interaction between renames and deletes is messy.
        # Handle it here ...
        file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
            changes.renamed, changes.removed, revision_id, tree_old)

        # Map kind changes to a delete followed by an add
        for path, id_, kind1, kind2 in changes.kind_changed:
            path = self._adjust_path_for_renames(path, renamed, revision_id)
            # IGC: I don't understand why a delete is needed here.
            # In fact, it seems harmful? If you uncomment this line,
            # please file a bug explaining why you needed to.
            #file_cmds.append(commands.FileDeleteCommand(path))
            my_modified.append((path, id_, kind2))

        # Record modifications
        for path, id_, kind in changes.added + my_modified + rd_modifies:
            if kind == 'file':
                text = tree_new.get_file_text(id_)
                file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
                    helpers.kind_to_mode('file', tree_new.is_executable(id_)),
                    None, text))
            elif kind == 'symlink':
                file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
                    helpers.kind_to_mode('symlink', False),
                    None, tree_new.get_symlink_target(id_)))
            elif kind == 'directory':
                # Directories are only emitted in the extended format.
                if not self.plain_format:
                    file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
                        helpers.kind_to_mode('directory', False),
                        None, None))
            else:
                self.warning("cannot export '%s' of kind %s yet - ignoring" %
                    (path, kind))
        return file_cmds

    def _process_renames_and_deletes(self, renames, deletes,
        revision_id, tree_old):
        """Order the rename and delete commands of one revision.

        :param renames: the renamed entries reported by changes_from()
        :param deletes: the removed entries reported by changes_from()
        :param revision_id: id of the revision, used in messages only
        :param tree_old: the tree the changes are relative to
        :return: (file_cmds, modifies, renamed) where file_cmds is the list
          of delete/rename commands, modifies lists (path, id, kind) for
          renamed entries whose text or metadata also changed, and renamed
          is a list of [oldpath, newpath] pairs
        """
        file_cmds = []
        modifies = []
        renamed = []

        # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
        # In a nutshell, there are several nasty cases:
        #
        # 1) bzr rm a; bzr mv b a; bzr commit
        # 2) bzr mv x/y z; bzr rm x; commit
        #
        # The first must come out with the delete first like this:
        #
        # D a
        # R b a
        #
        # The second case must come out with the rename first like this:
        #
        # R x/y z
        # D x
        #
        # So outputting all deletes first or all renames first won't work.
        # Instead, we need to make multiple passes over the various lists to
        # get the ordering right.

        must_be_renamed = {}
        old_to_new = {}
        deleted_paths = set([p for p, _, _ in deletes])
        for (oldpath, newpath, id_, kind,
                text_modified, meta_modified) in renames:
            # Directory entries are only emitted in the extended format.
            emit = kind != 'directory' or not self.plain_format
            if newpath in deleted_paths:
                if emit:
                    file_cmds.append(commands.FileDeleteCommand(newpath.encode("utf-8")))
                deleted_paths.remove(newpath)
            if (self.is_empty_dir(tree_old, oldpath)):
                self.note("Skipping empty dir %s in rev %s" % (oldpath,
                    revision_id))
                continue
            #oldpath = self._adjust_path_for_renames(oldpath, renamed,
            #    revision_id)
            renamed.append([oldpath, newpath])
            old_to_new[oldpath] = newpath
            if emit:
                file_cmds.append(
                    commands.FileRenameCommand(oldpath.encode("utf-8"), newpath.encode("utf-8")))
            if text_modified or meta_modified:
                modifies.append((newpath, id_, kind))

            # Renaming a directory implies all children must be renamed.
            # Note: changes_from() doesn't handle this
            if kind == 'directory' and tree_old.kind(id_) == 'directory':
                for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
                    if e.kind == 'directory' and self.plain_format:
                        continue
                    old_child_path = osutils.pathjoin(oldpath, p)
                    new_child_path = osutils.pathjoin(newpath, p)
                    must_be_renamed[old_child_path] = new_child_path

        # Add children not already renamed
        if must_be_renamed:
            renamed_already = set(old_to_new.keys())
            still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
            for old_child_path in sorted(still_to_be_renamed):
                new_child_path = must_be_renamed[old_child_path]
                if self.verbose:
                    self.note("implicitly renaming %s => %s" % (old_child_path,
                        new_child_path))
                file_cmds.append(commands.FileRenameCommand(old_child_path.encode("utf-8"),
                    new_child_path.encode("utf-8")))

        # Record remaining deletes
        for path, id_, kind in deletes:
            if path not in deleted_paths:
                continue
            if kind == 'directory' and self.plain_format:
                continue
            #path = self._adjust_path_for_renames(path, renamed, revision_id)
            file_cmds.append(commands.FileDeleteCommand(path.encode("utf-8")))
        return file_cmds, modifies, renamed

    @staticmethod
    def _replace_path_prefix(path, old, new):
        """Return path with its leading directory ``old`` replaced by ``new``.

        The caller must have checked that path starts with old + '/'.
        """
        return new + path[len(old):]

    def _adjust_path_for_renames(self, path, renamed, revision_id):
        """Map path through the renames already recorded for a revision.

        :param path: the path to adjust
        :param renamed: list of [old, new] pairs, applied in order
        :param revision_id: id of the revision, used in messages only
        :return: the adjusted path
        """
        # If a previous rename is found, we should adjust the path
        for old, new in renamed:
            if path == old:
                self.note("Changing path %s given rename to %s in revision %s"
                    % (path, new, revision_id))
                path = new
            elif path.startswith(old + '/'):
                self.note(
                    "Adjusting path %s given rename of %s to %s in revision %s"
                    % (path, old, new, revision_id))
                # Rewrite the leading prefix only: str.replace() would also
                # change any later "old/" component inside the path.
                path = self._replace_path_prefix(path, old, new)
        return path

    def emit_tags(self):
        """Emit a reset command for every tag that points at a marked revision.

        In plain format, tag names that are not valid git refs are either
        rewritten (when rewrite_tags is set) or skipped with a warning.
        """
        for tag, revid in self.branch.tags.get_tag_dict().items():
            # A revision without a mark, or one recorded as a ghost, has no
            # commit in the stream for the tag to point at.
            mark = self.revid_to_mark.get(revid, self._ghost_mark)
            if mark == self._ghost_mark:
                self.warning('not creating tag %r pointing to non-existent '
                    'revision %s' % (tag, revid))
                continue
            git_ref = 'refs/tags/%s' % tag.encode("utf-8")
            if self.plain_format and not check_ref_format(git_ref):
                if self.rewrite_tags:
                    new_ref = sanitize_ref_name_for_git(git_ref)
                    self.warning('tag %r is exported as %r to be valid in git.',
                        git_ref, new_ref)
                    git_ref = new_ref
                else:
                    self.warning('not creating tag %r as its name would not be '
                        'valid in git.', git_ref)
                    continue
            self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))

    def _next_tmp_ref(self):
        """Return a unique branch name. The name will start with "tmp"."""
        prefix = 'tmp'
        if prefix not in self.branch_names:
            # First temporary branch: plain "tmp".
            self.branch_names[prefix] = 0
        else:
            # Later ones are numbered: "tmp.1", "tmp.2", ...
            self.branch_names[prefix] += 1
            prefix = '%s.%d' % (prefix, self.branch_names[prefix])
        return 'refs/heads/%s' % prefix