hg-fast-export.sh: handle gc'd marks
[girocco-hg-fast-export.git] / hg-fast-export.py
blobd7909a58d74c2ccd949619bf6e20055e62791e77
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 import sys
7 import os
9 sys = reload(sys)
10 sys.setdefaultencoding("utf-8")
11 os.environ["HGENCODING"] = "UTF-8"
12 os.environ["HGENCODINGMODE"] = "replace"
14 from mercurial import node
15 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
16 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name,set_unknown_addr
17 from optparse import OptionParser
18 import re
if sys.platform == "win32":
    # On Windows, sys.stdout is initially opened in text mode, which means
    # that every LF (\n) written to it is converted into CRLF (\r\n).  That
    # makes git blow up, so switch the stdout file descriptor to binary mode
    # using the platform-specific msvcrt module.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Matches a whole "Signed-off-by: <who>" log line; group 1 captures <who>.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command each time the running command count reaches a
# multiple of this value; 0 disables checkpoints altogether.
cfg_checkpoint_count = 0
# Write a progress message to stderr every this many exported file contents.
cfg_export_boundary = 1000
# Patterns used when rewriting names into acceptable git ref names:
# runs of characters to be replaced (control chars, space, ~ ^ : \ * ? [)
ref_crud_re = re.compile(r'[[\x00-\x1f\x7f ~^:\\*?]+', re.S)
# the ".." sequence
ref_dotdot_re = re.compile(r'\.\.')
# the "@{" sequence
ref_atbrace_re = re.compile(r'@\{')
# a name ending in ".lock" (case-insensitive)
ref_dotlock_re = re.compile(r'.*\.lock$', re.I)
# runs of one or more slashes
ref_separators_re = re.compile(r'/+')
# runs of one or more underscores
ref_collapse_re = re.compile(r'_+')
def gitmode(flags):
    """Map a file's flags onto a git file-mode string.

    Returns '120000' when flags contains 'l', otherwise '100755' when it
    contains 'x', otherwise '100644'.  The 'l' check takes precedence.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout as-is; a falsy msg writes nothing at all."""
    if not msg:
        return
    sys.stdout.write(msg)


def wr(msg=''):
    """Write msg (if any) to stdout, then always a single newline."""
    wr_no_nl(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Advance the running command counter and emit a checkpoint when due.

    count is incremented by one.  If cfg_checkpoint_count is positive and the
    new count is a multiple of it, a notice goes to stderr and a 'checkpoint'
    command followed by an empty line is written to stdout.

    Returns the incremented count.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Turn an hg revision number into a git-fast-import revision reference.

    If old_marks holds a truthy entry for rev (an SHA1), that entry is
    returned; otherwise the mark reference ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when the hex forms of two file revision ids differ."""
    left_hex, right_hex = node.hex(f1), node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=None):
    """Sort the files of two manifests into left-only, changed and right-only.

    dleft  -- manifest of the revision being examined
    dright -- manifest of one of its parents
    Both must provide keys(), get(), item access and flags(name).

    l, c, r -- lists to append results to; a fresh list is created for each
               one that is omitted (they used to be shared mutable defaults,
               which leaked results from one call into the next)
    match   -- predicate(left_id, right_id) returning True when the two file
               revisions differ; defaults to file_mismatch

    A file present in both manifests counts as changed when match() says so
    or when its git mode differs between the two sides.

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    if match is None:
        # resolved at call time rather than at definition time
        match = file_mismatch

    for name in dleft.keys():
        other = dright.get(name, None)
        if other is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif (match(dleft[name], other)
              or gitmode(dleft.flags(name)) != gitmode(dright.flags(name))):
            # we have it but content or mode mismatch: add to center set
            c.append(name)

    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # a change is already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files of mleft against its parents.

    For every non-negative parent revision in parents, that parent's manifest
    is compared with mleft via split_dict(), accumulating into the same three
    lists.  Each list is sorted before being returned.

    Returns the tuple (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            mright = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(mleft, mright,
                                                 added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the author of a commit from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch.  The
    log message is scanned from its end: trailing blank lines are skipped,
    then the contiguous run of Signed-off-by lines directly above is walked
    upwards.  The topmost line of that run names the author, which is passed
    through fixup_user() together with the authors table.

    Only that trailing run is considered, so a line in the middle of the
    message that happens to start with "Signed-off-by: foo" is ignored.

    Returns committer when the last non-blank line is not a Signed-off-by
    line (or the message has no non-blank line).
    """
    lines = logmessage.split('\n')

    # position idx on the last non-blank line (or -1 if there is none)
    idx = len(lines) - 1
    while idx >= 0 and len(lines[idx].strip()) == 0:
        idx -= 1

    # climb through the trailing block of sob lines, remembering the topmost
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is not None:
        return fixup_user(topmost.group(1), authors)
    return committer
def export_file_contents(ctx, manifest, files, hgtags):
    """Write an inline 'M' (file modify) command plus data for each file.

    ctx      -- change context supplying filectx(name).data()
    manifest -- manifest supplying flags(name), used for the git mode
    files    -- names of the files to export
    hgtags   -- when false, a file named ".hgtags" is skipped with a notice

    Progress is reported on stderr every cfg_export_boundary files, and once
    more at the end if more than cfg_export_boundary files were requested.
    """
    total = len(files)
    written = 0
    for name in files:
        # Skip .hgtags files. They only get us in trouble.
        if not hgtags and name == ".hgtags":
            sys.stderr.write('Skip %s\n' % (name))
            continue
        data = ctx.filectx(name).data()
        mode = gitmode(manifest.flags(name))
        wr('M %s inline %s' % (mode, strip_leading_slash(name)))
        # the length is taken from the data itself (size() gave trouble)
        wr('data %d' % len(data))
        wr(data)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name, what="branch", flatten=False):
    """Sanitize a name roughly according to git-check-ref-format(1).

    name    -- the name to clean up
    what    -- kind of name, used only in the warning message
    flatten -- when true, slashes are turned into underscores so the result
               is a one-level name; otherwise each slash-separated component
               is cleaned individually

    A warning is written to stderr whenever the result differs from name.
    Returns the sanitized name.
    """

    def undot(part):
        # replace a leading dot of a path component by an underscore
        if part.startswith('.'):
            return '_' + part[1:]
        return part

    # be paranoid about an empty name, just in case
    clean = '_' if name == '' else name

    clean = ref_crud_re.sub('_', clean)
    clean = ref_dotdot_re.sub('_', clean)
    clean = ref_atbrace_re.sub('_{', clean)
    if ref_dotlock_re.match(clean):
        # turn the dot of the trailing ".lock" into an underscore
        clean = clean[:-5] + '_' + clean[-4:]
    if clean[-1] in ('/', '.'):
        clean = clean[:-1] + '_'

    if flatten:
        if clean[0] == '.':
            clean = '_' + clean[1:]
        clean = ref_separators_re.sub('_', clean)
    else:
        clean = '/'.join([undot(part) for part in clean.split('/')])
        if clean[0] == '/':
            clean = '_' + clean[1:]
        clean = ref_separators_re.sub('/', clean)
    clean = ref_collapse_re.sub('_', clean)

    if clean != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, clean))
    return clean
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    At most one leading slash is removed.  An empty filename is returned
    unchanged; indexing filename[0] used to raise IndexError for it.
    """
    # slicing instead of indexing keeps the empty string safe
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui, repo, revision, old_marks, max, count, authors, sob, brmap, hgtags, flatten, notes):
    """Write one hg revision to stdout as a git-fast-import commit.

    ui, repo  -- handles passed on to get_changeset() and used for lookups
    revision  -- hg revision number; the commit gets mark :<revision+1>
    old_marks -- mapping consulted by revnum_to_revref() for parent refs
    max       -- total shown in the progress message
    count     -- running command counter
    authors   -- author table for get_changeset() and get_author()
    sob       -- when true, emit an 'author' line derived from Signed-off-by
    brmap     -- cache of hg branch name -> sanitized git branch name;
                 updated in place
    hgtags    -- passed to export_file_contents()
    flatten   -- passed to sanitize_name()
    notes     -- passed to generate_note()

    Returns the updated command counter.
    """

    def get_branchname(name):
        # sanitize each hg branch name only once and remember the result
        if name not in brmap:
            brmap[name] = sanitize_name(name, flatten=flatten)
        return brmap[name]

    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 starts the branch from scratch
    if not parents and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 covers the newline that wr() appends after the description
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: with exactly one parent, take the
            # changes from repo.status() without expensively comparing
            # checksums
            changed, added, removed = repo.status(repo.lookup(parents[0]), revnode)[:3]
            kind = 'simple delta'
        else:
            # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # comparing checksums is expensive for many files, so it is only
            # done for merges where hg's revlog logic makes it necessary
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    removed = [strip_leading_slash(x) for x in removed]

    for gone in removed:
        wr('D %s' % gone)
    export_file_contents(ctx, man, added, hgtags)
    export_file_contents(ctx, man, changed, hgtags)
    wr()

    count = checkpoint(count)
    count = generate_note(user, time, timezone, revision, ctx, count, notes)
    return count
def generate_note(user, time, timezone, revision, ctx, count, notes):
    """Attach the hg hash of a revision to its commit under refs/notes/hg.

    Does nothing and returns count unchanged when notes is false.  Otherwise
    a commit on refs/notes/hg with an empty message is written whose note for
    mark :<revision+1> contains ctx.hex(), and the counter is advanced through
    checkpoint().

    Returns the (possibly advanced) command counter.
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user, time, timezone))
        wr('data 0')
        wr('N inline :%d' % (revision + 1))
        hg_hash = ctx.hex()
        wr('data %d' % (len(hg_hash)))
        # the hash itself is written without a newline of its own
        wr_no_nl(hg_hash)
        wr()
        count = checkpoint(count)
    return count
def export_tags(ui, repo, old_marks, mapping_cache, count, authors, flatten):
    """Write a 'reset refs/tags/<tag>' command for each exportable hg tag.

    Tags come from repo.tagslist(); each name is sanitized first.  The 'tip'
    tag is skipped, as are tags whose node is not present in mapping_cache
    (keyed by the node's hex form).  Every exported tag advances the command
    counter through checkpoint().

    Returns the updated command counter.
    """
    for tag, tagnode in repo.tagslist():
        tag = sanitize_name(tag, "tag", flatten=flatten)
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map from filename.

    Each meaningful line has the form ``key = value``.  Blank lines and lines
    starting with '#' are ignored.  Any other line that does not match is
    reported on stderr with its 1-based line number and skipped.  The number
    of loaded entries is reported on stderr.

    Returns a dict mapping the stripped key to the stripped value; the dict
    is empty when filename does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    entry_re = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded = 0  # number of entries stored in cache
    # 'with' closes the file even if reading raises
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = entry_re.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache, both stripped of surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def branchtip(repo, heads):
    """Return the tipmost branch head in heads.

    Walking heads from last to first, the first head whose changelog entry
    has no 'close' in its sixth field is returned.  If every head has one,
    the last element of heads is returned.
    """
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
def verify_heads(ui, repo, cache, force):
    """Check that the git branches still match what the last run recorded.

    First, for every hg branch (and only those, not everything git has) the
    value of get_git_sha1() is compared with the entry stored in cache.
    Second, every repository head is checked so that no branch has more than
    one head.  Each problem is reported on stderr.

    Returns False at the first problem unless force is true; otherwise True.
    """
    branches = {}
    for bn, heads in repo.branchmap().iteritems():
        branches[bn] = branchtip(repo, heads)
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in branches.items()])

    # get list of hg's branches to verify, don't take all git has
    for _, _, name in ordered:
        name = get_branch(name)
        sha1 = get_git_sha1(name)
        cached = cache.get(name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (name, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s (branch: %s)\n' %
                             (repo.changelog.rev(h), branch))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile, authors=None, sob=False, force=False, hgtags=False, flatten=False, notes=False):
    """Export an hg repository to stdout as a git-fast-import stream.

    repourl     -- location of the hg repository
    m           -- highest revision count to export; negative means up to tip
    marksfile   -- cache file with marks from a previous run
    mappingfile -- cache file mapping hg node hex -> revision number
    headsfile   -- cache file with the heads recorded by the previous run
    tipfile     -- state cache file; its 'tip' entry is where export resumes
    authors     -- author table (defaults to an empty dict)
    sob, hgtags, flatten, notes -- passed on to export_commit()
    force       -- passed to verify_heads() to ignore validation errors

    Returns 1 if verify_heads() fails, 0 otherwise.
    """
    if authors is None:
        # avoid sharing one mutable default dict between calls
        authors = {}

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    try:
        tip = repo.changelog.count()
    except AttributeError:
        # changelog has no count(); fall back to the length of the repo
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # refresh the node -> revision mapping for everything up to 'last'
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    if notes and get_git_sha1('hg', 'notes') is not None:
        wr('reset refs/notes/hg')
        wr('from refs/notes/hg^0')

    # running count of issued commands; must start at zero before it is
    # handed to export_commit() for the first time
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, sob, brmap, hgtags, flatten, notes)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, flatten)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Complain about a missing option, print the help text, exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("--flatten", action="store_true", dest="flatten",
                      default=False, help="Create one-level ref names (convert '/' to '_')")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-U", dest="unknown",
                      help="Email address to use for unknown instead of 'unknown' (previous default was 'devnull@localhost')")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")

    (options, args) = parser.parse_args()

    # -1 means "no limit" for hg2git()
    m = options.max if options.max is not None else -1

    # mandatory options, checked in this order
    required = (
        ('marksfile', '--marks'),
        ('mappingfile', '--mapping'),
        ('headsfile', '--heads'),
        ('statusfile', '--status'),
        ('repourl', '--repo'),
    )
    for dest, flag in required:
        if getattr(options, dest) is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.unknown is not None:
        if not set_unknown_addr(options.unknown):
            sys.stderr.write("Error: Invalid email address '%s'\n" % options.unknown)
            sys.exit(2)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob, force=options.force,
                    hgtags=options.hgtags, flatten=options.flatten, notes=options.notes))