# -*- coding: utf-8 -*-

# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# Original Copyright (c) 2008 Adeodato Simó
# Original License: MIT (See exporters/bzr-fast-export.LICENSE)
#
# vim: fileencoding=utf-8
"""Core engine for the fast-export command."""
# TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
# is not updated (because the parent of the commit is already merged, so we
# don't set new_git_branch to the previously used name)
30 from email
.Utils
import parseaddr
34 import bzrlib
.revision
43 from bzrlib
.plugins
.fastimport
import commands
, helpers
, marks_file
46 class BzrFastExporter(object):
def __init__(self, source, destination, git_branch=None, checkpoint=-1,
             import_marks_file=None, export_marks_file=None, revision=None,
             verbose=False, plain_format=False):
    """Export branch data in fast import format.

    :param source: location of the branch to export; kept as
      ``self.source``.
    :param destination: where the stream is written. ``None`` or ``'-'``
      selects standard output, a name ending in ``'gz'`` is written
      through ``gzip``, anything else is opened as a plain binary file.
    :param git_branch: branch name handed to ``emit_commit``.
    :param checkpoint: when greater than zero, a checkpoint command is
      emitted every ``checkpoint`` commits.
    :param import_marks_file: optional marks file loaded at start-up to
      seed ``revid_to_mark`` and ``branch_names``.
    :param export_marks_file: optional marks file written by
      ``_save_marks``.
    :param revision: optional revision range restricting the export.
    :param verbose: enables more frequent progress reporting.
    :param plain_format: if True, 'classic' fast-import format is
      used without any extended features; if False, the generated
      data is richer and includes information like multiple
      authors, revision properties, etc.
    """
    # NOTE(review): the extracted source had lost this assignment, the
    # two ``else:`` lines, the ``if verbose:`` guard and the local gzip
    # import; they are reconstructed from the surrounding code - confirm.
    self.source = source
    if destination is None or destination == '-':
        self.outf = helpers.binary_stream(sys.stdout)
    elif destination.endswith('gz'):
        # Imported lazily: only needed for compressed output.
        import gzip
        self.outf = gzip.open(destination, 'wb')
    else:
        self.outf = open(destination, 'wb')
    self.git_branch = git_branch
    self.checkpoint = checkpoint
    self.import_marks_file = import_marks_file
    self.export_marks_file = export_marks_file
    self.revision = revision
    self.excluded_revisions = set()
    self.plain_format = plain_format
    # Older bzrlib releases only expose the single-author API.
    self._multi_author_api_available = hasattr(
        bzrlib.revision.Revision, 'get_apparent_authors')
    # Properties already conveyed through the author fields.
    self.properties_to_exclude = ['authors', 'author']

    # Progress reporting stuff
    self.verbose = verbose
    if verbose:
        self.progress_every = 100
    else:
        self.progress_every = 1000
    self._start_time = time.time()
    self._commit_total = 0

    # Load the marks and initialise things accordingly
    self.revid_to_mark = {}
    self.branch_names = {}
    if self.import_marks_file:
        marks_info = marks_file.import_marks(self.import_marks_file)
        if marks_info is not None:
            # The file maps mark -> revision id; we need the inverse.
            self.revid_to_mark = dict(
                (r, m) for m, r in marks_info[0].items())
            self.branch_names = marks_info[1]
def interesting_history(self):
    """Return the list of revision ids to export.

    The ids come from ``self.branch.iter_merge_sorted_revisions`` and are
    returned in the reverse of the order that call yields them. When a
    starting revision is given, ``self.excluded_revisions`` is replaced
    with the set of revision ids reported for that starting revision so
    that later stages can avoid emitting them.

    :return: a list of revision ids.
    """
    # NOTE(review): the ``if self.revision:`` guard and the ``else``
    # defaults were missing from the extracted source; reconstructed from
    # the ``start_rev_id is not None`` check below - confirm.
    if self.revision:
        rev1, rev2 = builtins._get_revision_range(
            self.revision, self.branch, "fast-export")
        start_rev_id = rev1.rev_id
        end_rev_id = rev2.rev_id
    else:
        start_rev_id = None
        end_rev_id = None

    self.note("Calculating the revisions to include ...")
    view_revisions = [
        rev_id for rev_id, _, _, _ in
        self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
    view_revisions.reverse()

    # If a starting point was given, we need to later check that we don't
    # start emitting revisions from before that point. Collect the
    # revisions to exclude now ...
    if start_rev_id is not None:
        self.note("Calculating the revisions to exclude ...")
        self.excluded_revisions = set(
            rev_id for rev_id, _, _, _ in
            self.branch.iter_merge_sorted_revisions(start_rev_id))
    return view_revisions
119 self
.branch
= bzrlib
.branch
.Branch
.open_containing(self
.source
)[0]
122 self
.branch
.repository
.lock_read()
124 interesting
= self
.interesting_history()
125 self
._commit
_total
= len(interesting
)
126 self
.note("Starting export of %d revisions ..." %
128 if not self
.plain_format
:
130 for revid
in interesting
:
131 self
.emit_commit(revid
, self
.git_branch
)
132 if self
.branch
.supports_tags():
135 self
.branch
.repository
.unlock()
137 # Save the marks if requested
def note(self, msg, *args):
    """Output a note, prefixed with the current time of day.

    :param msg: the message (may contain %-style placeholders).
    :param args: values forwarded untouched to ``trace.note``.
    """
    stamped = "%s %s" % (self._time_of_day(), msg)
    trace.note(stamped, *args)
def warning(self, msg, *args):
    """Output a warning, prefixed with the current time of day.

    :param msg: the message (may contain %-style placeholders).
    :param args: values forwarded untouched to ``trace.warning``.
    """
    stamped = "%s WARNING: %s" % (self._time_of_day(), msg)
    trace.warning(stamped, *args)
def _time_of_day(self):
    """Return the current time of day formatted as ``HH:MM:SS``."""
    # Kept as its own method so tests can patch in a fixed value.
    clock_format = "%H:%M:%S"
    return time.strftime(clock_format)
def report_progress(self, commit_count, details=''):
    """Emit a progress note every ``self.progress_every`` commits.

    Nothing is reported when ``commit_count`` is zero or is not a
    multiple of ``self.progress_every``.

    :param commit_count: number of commits exported so far.
    :param details: extra text appended to the progress note.
    """
    if not commit_count or commit_count % self.progress_every != 0:
        return

    if self._commit_total:
        counts = "%d/%d" % (commit_count, self._commit_total)
    else:
        counts = "%d" % (commit_count,)

    minutes = (time.time() - self._start_time) / 60
    if minutes > 0:
        rate = commit_count * 1.0 / minutes
        # NOTE(review): the threshold line was missing from the extracted
        # source; only the two format branches were visible - confirm the
        # cut-over value.
        if rate > 10:
            rate_str = "at %.0f/minute " % rate
        else:
            rate_str = "at %.1f/minute " % rate
    else:
        # No measurable time has elapsed (coarse clock or clock step):
        # omit the rate instead of dividing by zero.
        rate_str = ""
    self.note("%s commits exported %s%s" % (counts, rate_str, details))
def dump_stats(self):
    """Emit a closing note with the number of marks and the elapsed time.

    The count is the size of ``self.revid_to_mark``; the elapsed time is
    measured from ``self._start_time`` and rendered by
    ``progress.str_tdelta``.
    """
    time_required = progress.str_tdelta(time.time() - self._start_time)
    rc = len(self.revid_to_mark)
    # NOTE(review): the final argument was truncated in the extracted
    # source; ``time_required`` is the only value matching the third
    # placeholder - confirm.
    self.note("Exported %d %s in %s",
              rc, helpers.single_plural(rc, "revision", "revisions"),
              time_required)
def print_cmd(self, cmd):
    """Write ``repr(cmd)`` followed by a newline to ``self.outf``.

    :param cmd: the command object to serialise.
    """
    # Wrap in a 1-tuple so a tuple-valued ``cmd`` is formatted as a single
    # value rather than being unpacked as the %-format arguments.
    self.outf.write("%r\n" % (cmd,))
def _save_marks(self):
    """Write the marks file if ``self.export_marks_file`` is set.

    ``self.revid_to_mark`` is inverted into a mark -> revision id mapping
    before being handed to ``marks_file.export_marks``.
    """
    if not self.export_marks_file:
        return
    revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
    # NOTE(review): the last argument was truncated in the extracted
    # source; ``self.branch_names`` mirrors what ``__init__`` reads back
    # from ``import_marks`` - confirm.
    marks_file.export_marks(self.export_marks_file, revision_ids,
                            self.branch_names)
def is_empty_dir(self, tree, path):
    """Report whether ``path`` is a directory with no entries in ``tree``.

    :param tree: the tree to inspect; must provide ``path2id``, ``kind``
      and ``walkdirs``.
    :param path: the path to test.
    :return: True only when ``path`` has a file id, is of kind
      ``'directory'`` and the first ``walkdirs`` result lists no entries;
      False otherwise.
    """
    # NOTE(review): the guard and every ``return`` were missing from the
    # extracted source; reconstructed from the comments and messages that
    # survived - confirm.
    path_id = tree.path2id(path)
    if path_id is None:
        self.warning("Skipping empty_dir detection - no file_id for %s" %
                     (path,))
        return False

    # Continue if path is not a directory
    if tree.kind(path_id) != 'directory':
        return False

    # Use treewalk to find the contents of our directory. Only the first
    # result is needed, so stop there instead of walking the whole subtree.
    contents = next(iter(tree.walkdirs(prefix=path)), None)
    if contents is None:
        # Nothing was reported for the path; do not claim it is empty.
        return False
    return len(contents[1]) == 0
def emit_features(self):
    """Print one feature command per name in ``commands.FEATURE_NAMES``.

    The names are emitted in sorted order.
    """
    ordered_names = sorted(commands.FEATURE_NAMES)
    for feature_name in ordered_names:
        feature_cmd = commands.FeatureCommand(feature_name)
        self.print_cmd(feature_cmd)
def emit_commit(self, revid, git_branch):
    """Print the commit command for ``revid`` unless it must be skipped.

    A revision is skipped when it already has a mark or is listed in
    ``self.excluded_revisions``. A revision the repository cannot find is
    recorded with the mark ``-1`` and otherwise ignored.

    :param revid: the revision id to export.
    :param git_branch: branch name used to build the ``refs/heads/`` ref.
    """
    # NOTE(review): several lines (``try:``, the early ``return``s, the
    # parent-count branches and the mark assignment) were missing from
    # the extracted source and are reconstructed here - confirm.
    if revid in self.revid_to_mark or revid in self.excluded_revisions:
        return

    # Get the Revision object
    try:
        revobj = self.branch.repository.get_revision(revid)
    except bazErrors.NoSuchRevision:
        # This is a ghost revision. Mark it as not found and next!
        self.revid_to_mark[revid] = -1
        return

    # Get the primary parent
    # TODO: Consider the excluded revisions when deciding the parents.
    # Currently, a commit with parents that are excluded ought to be
    # triggering the git_branch calculation below (and it is not).
    ncommits = len(self.revid_to_mark)
    nparents = len(revobj.parent_ids)
    if nparents == 0:
        if ncommits:
            # This is a parentless commit but it's not the first one
            # output. We need to create a new temporary branch for it
            # otherwise git-fast-import will assume the previous commit
            # was this one's parent
            git_branch = self._next_tmp_branch_name()
        parent = bzrlib.revision.NULL_REVISION
    else:
        parent = revobj.parent_ids[0]

    # Print the commit
    git_ref = 'refs/heads/%s' % (git_branch,)
    mark = ncommits + 1
    self.revid_to_mark[revid] = mark
    file_cmds = self._get_filecommands(parent, revid)
    self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
                                            file_cmds))

    # Report progress and checkpoint if it's time for that
    self.report_progress(ncommits)
    if (self.checkpoint > 0 and ncommits
            and ncommits % self.checkpoint == 0):
        self.note("Exported %i commits - adding checkpoint to output"
                  % ncommits)
        # Persist the marks so the export can be resumed from here.
        self._save_marks()
        self.print_cmd(commands.CheckpointCommand())
def _get_name_email(self, user):
    """Split a user string into a ``(name, email)`` pair.

    :param user: a committer/author string such as ``Joe <joe@x.org>``.
    :return: ``(name, email)``. When ``user`` contains no ``<`` the whole
      string is returned as the name with an empty email.
    """
    # NOTE(review): both branch bodies and the ``return`` were missing
    # from the extracted source; reconstructed from the comment below -
    # confirm.
    if '<' not in user:
        # If the email isn't inside <>, we need to use it as the name
        # in order for things to round-trip correctly.
        # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
        name = user
        email = ''
    else:
        name, email = parseaddr(user)
    return name, email
def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
    """Build the ``commands.CommitCommand`` for a revision.

    :param git_ref: the ref the commit is written to.
    :param mark: the mark assigned to this commit.
    :param revobj: the revision object; its committer, authors,
      timestamp, timezone, message, parent ids and properties are read.
    :param file_cmds: the file commands for the commit; passed on as an
      iterator.
    :return: the ``commands.CommitCommand`` instance.
    """
    # NOTE(review): several ``else:`` branches and the bodies of the
    # parent/property loops were missing from the extracted source and
    # are reconstructed here - confirm.

    def _person_info(user):
        # Every person tuple shares the revision's timestamp and timezone.
        name, email = self._get_name_email(user)
        return (name, email, revobj.timestamp, revobj.timezone)

    # Get the committer and author info
    committer = revobj.committer
    committer_info = _person_info(committer)
    if self._multi_author_api_available:
        # Copy so popping the primary author never alters the caller's list.
        more_authors = list(revobj.get_apparent_authors())
        author = more_authors.pop(0)
    else:
        more_authors = []
        author = revobj.get_apparent_author()
    if more_authors:
        author_info = _person_info(author)
        more_author_info = [_person_info(a) for a in more_authors]
    elif author != committer:
        author_info = _person_info(author)
        more_author_info = None
    else:
        author_info = None
        more_author_info = None

    # Get the parents in terms of marks
    non_ghost_parents = []
    for p in revobj.parent_ids:
        if p in self.excluded_revisions:
            continue
        parent_mark = self.revid_to_mark.get(p)
        # Unknown parents are ghosts, and -1 is the sentinel emit_commit
        # stores for revisions the repository could not find; neither is
        # a valid mark reference.
        if parent_mark is None or parent_mark == -1:
            continue
        non_ghost_parents.append(":%s" % parent_mark)
    if non_ghost_parents:
        from_ = non_ghost_parents[0]
        merges = non_ghost_parents[1:]
    else:
        from_ = None
        merges = None

    # Filter the revision properties. Some metadata (like the
    # author information) is already exposed in other ways so
    # don't repeat it here.
    if self.plain_format:
        properties = None
    else:
        # Build a filtered copy rather than deleting keys from the
        # revision's own dictionary.
        excluded = self.properties_to_exclude
        properties = dict((key, value)
                          for key, value in revobj.properties.items()
                          if key not in excluded)

    # Build and return the result
    return commands.CommitCommand(
        git_ref, mark, author_info, committer_info, revobj.message,
        from_, merges, iter(file_cmds),
        more_authors=more_author_info, properties=properties)
def _get_revision_trees(self, parent, revision_id):
    """Return the ``(old, new)`` revision trees to diff for a commit.

    :param parent: revision id of the primary parent.
    :param revision_id: revision id being exported.
    :return: ``(tree_old, tree_new)``; ``tree_new`` is None when the
      revision's inventory is malformed.
    """
    # NOTE(review): the ``try:`` lines and the ``tree_new = None`` default
    # were missing from the extracted source; reconstructed from the
    # ``except`` clauses and the caller's None check - confirm.
    repository = self.branch.repository
    try:
        tree_old = repository.revision_tree(parent)
    except bazErrors.UnexpectedInventoryFormat:
        self.warning("Parent is malformed - diffing against previous parent")
        # We can't find the old parent. Let's diff against his parent
        pp = repository.get_revision(parent)
        if pp.parent_ids:
            grandparent = pp.parent_ids[0]
        else:
            # A parentless parent: diff against the null revision, as
            # emit_commit does for parentless commits.
            grandparent = bzrlib.revision.NULL_REVISION
        tree_old = repository.revision_tree(grandparent)

    tree_new = None
    try:
        tree_new = repository.revision_tree(revision_id)
    except bazErrors.UnexpectedInventoryFormat:
        # We can't really do anything anymore
        self.warning("Revision %s is malformed - skipping" % revision_id)
    return tree_old, tree_new
def _get_filecommands(self, parent, revision_id):
    """Get the list of FileCommands for the changes between two revisions.

    :param parent: revision id of the tree to diff against.
    :param revision_id: revision id being exported.
    :return: a list of file commands; empty when either tree could not
      be obtained.
    """
    # NOTE(review): the ``return`` statements, the ``if kind == 'file'``
    # line, the trailing ``else:`` and the arguments of the directory
    # command were missing from the extracted source and are
    # reconstructed here - confirm.
    tree_old, tree_new = self._get_revision_trees(parent, revision_id)
    if not (tree_old and tree_new):
        # Something is wrong with this revision - ignore the filecommands
        return []

    changes = tree_new.changes_from(tree_old)

    # Make "modified" have 3-tuples, as added does
    my_modified = [x[0:3] for x in changes.modified]

    # The potential interaction between renames and deletes is messy.
    file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
        changes.renamed, changes.removed, revision_id, tree_old)

    # Map kind changes to a delete followed by an add
    for path, id_, kind1, kind2 in changes.kind_changed:
        path = self._adjust_path_for_renames(path, renamed, revision_id)
        # IGC: I don't understand why a delete is needed here.
        # In fact, it seems harmful? If you uncomment this line,
        # please file a bug explaining why you needed to.
        #file_cmds.append(commands.FileDeleteCommand(path))
        my_modified.append((path, id_, kind2))

    # Record modifications
    for path, id_, kind in changes.added + my_modified + rd_modifies:
        if kind == 'file':
            text = tree_new.get_file_text(id_)
            file_cmds.append(commands.FileModifyCommand(
                path, 'file', tree_new.is_executable(id_), None, text))
        elif kind == 'symlink':
            file_cmds.append(commands.FileModifyCommand(
                path, 'symlink', False, None,
                tree_new.get_symlink_target(id_)))
        elif kind == 'directory':
            # Directories are only part of the extended format.
            if not self.plain_format:
                file_cmds.append(commands.FileModifyCommand(
                    path, 'directory', False, None, None))
        else:
            self.warning("cannot export '%s' of kind %s yet - ignoring" %
                         (path, kind))
    return file_cmds
def _process_renames_and_deletes(self, renames, deletes,
                                 revision_id, tree_old):
    """Order the rename and delete commands for one revision.

    :param renames: iterable of ``(oldpath, newpath, id, kind,
      text_modified, meta_modified)`` tuples.
    :param deletes: iterable of ``(path, id, kind)`` tuples.
    :param revision_id: revision being exported; used in notes only.
    :param tree_old: the tree the changes are relative to.
    :return: ``(file_cmds, modifies, renamed)`` - the commands to emit,
      the ``(newpath, id, kind)`` entries that also need a modify
      command, and the ``[oldpath, newpath]`` pairs recorded.
    """
    # NOTE(review): the initialisers, the ``continue`` lines and the
    # ``if emit:`` / ``if must_be_renamed:`` / ``if self.verbose:`` guards
    # were missing from the extracted source and are reconstructed here -
    # confirm.
    file_cmds = []
    modifies = []
    renamed = []

    # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
    # In a nutshell, there are several nasty cases:
    #
    # 1) bzr rm a; bzr mv b a; bzr commit
    # 2) bzr mv x/y z; bzr rm x; commit
    #
    # The first must come out with the delete first like this:
    #
    # D a
    # R b a
    #
    # The second case must come out with the rename first like this:
    #
    # R x/y z
    # D x
    #
    # So outputting all deletes first or all renames first won't work.
    # Instead, we need to make multiple passes over the various lists to
    # get the ordering right.

    must_be_renamed = {}
    old_to_new = {}
    deleted_paths = set(p for p, _, _ in deletes)
    for (oldpath, newpath, id_, kind,
            text_modified, meta_modified) in renames:
        # Directory commands are only part of the extended format.
        emit = kind != 'directory' or not self.plain_format
        if newpath in deleted_paths:
            # The target must be deleted before it is overwritten.
            if emit:
                file_cmds.append(commands.FileDeleteCommand(newpath))
            deleted_paths.remove(newpath)
        if self.is_empty_dir(tree_old, oldpath):
            self.note("Skipping empty dir %s in rev %s" % (oldpath,
                                                            revision_id))
            continue
        #oldpath = self._adjust_path_for_renames(oldpath, renamed,
        #    revision_id)
        renamed.append([oldpath, newpath])
        old_to_new[oldpath] = newpath
        if emit:
            file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
        if text_modified or meta_modified:
            modifies.append((newpath, id_, kind))

        # Renaming a directory implies all children must be renamed.
        # Note: changes_from() doesn't handle this
        if kind == 'directory':
            for p, e in tree_old.inventory.iter_entries_by_dir(
                    from_dir=id_):
                if e.kind == 'directory' and self.plain_format:
                    continue
                old_child_path = osutils.pathjoin(oldpath, p)
                new_child_path = osutils.pathjoin(newpath, p)
                must_be_renamed[old_child_path] = new_child_path

    # Add children not already renamed
    if must_be_renamed:
        renamed_already = set(old_to_new)
        still_to_be_renamed = set(must_be_renamed) - renamed_already
        for old_child_path in sorted(still_to_be_renamed):
            new_child_path = must_be_renamed[old_child_path]
            if self.verbose:
                self.note("implicitly renaming %s => %s" % (
                    old_child_path, new_child_path))
            file_cmds.append(commands.FileRenameCommand(old_child_path,
                                                        new_child_path))

    # Record remaining deletes
    for path, id_, kind in deletes:
        if path not in deleted_paths:
            continue
        if kind == 'directory' and self.plain_format:
            continue
        #path = self._adjust_path_for_renames(path, renamed, revision_id)
        file_cmds.append(commands.FileDeleteCommand(path))
    return file_cmds, modifies, renamed
def _adjust_path_for_renames(self, path, renamed, revision_id):
    """Rewrite ``path`` to account for renames already recorded.

    :param path: the path to adjust.
    :param renamed: sequence of ``(old, new)`` pairs, applied in order.
    :param revision_id: revision being exported; used in notes only.
    :return: the adjusted path.
    """
    # NOTE(review): the ``if path == old:`` test, the ``path = new``
    # assignment and the final ``return`` were missing from the extracted
    # source; reconstructed from the surviving notes - confirm.
    # If a previous rename is found, we should adjust the path
    for old, new in renamed:
        if path == old:
            self.note("Changing path %s given rename to %s in revision %s"
                      % (path, new, revision_id))
            path = new
        elif path.startswith(old + '/'):
            self.note(
                "Adjusting path %s given rename of %s to %s in revision %s"
                % (path, old, new, revision_id))
            # Swap only the leading directory; str.replace would also
            # rewrite any later component that happens to match.
            path = new + path[len(old):]
    return path
485 for tag
, revid
in self
.branch
.tags
.get_tag_dict().items():
487 mark
= self
.revid_to_mark
[revid
]
489 self
.warning('not creating tag %r pointing to non-existent '
490 'revision %s' % (tag
, revid
))
492 git_ref
= 'refs/tags/%s' % tag
493 self
.print_cmd(commands
.ResetCommand(git_ref
, ":" + str(mark
)))
495 def _next_tmp_branch_name(self
):
496 """Return a unique branch name. The name will start with "tmp"."""
498 if prefix
not in self
.branch_names
:
499 self
.branch_names
[prefix
] = 0
501 self
.branch_names
[prefix
] += 1
502 prefix
= '%s.%d' % (prefix
, self
.branch_names
[prefix
])