fix timezones in darcs-fast-export, take 2
[girocco-darcs-fast-export.git] / darcs-fast-export
blob 1f153a6be974406cadc5cb8f54566027af286647
1 #!/usr/bin/env python
3 """
5 darcs-fast-export - darcs backend for fast data importers
7 Copyright (c) 2008, 2009 Miklos Vajna <vmiklos@frugalware.org>
8 Copyright (c) 2008 Matthias Andree <matthias.andree@gmx.de>
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 """
26 import xml.dom.minidom
27 import xml.parsers.expat
28 import os
29 import sys
30 import gzip
31 import time
32 import calendar
33 import shutil
34 import subprocess
35 import optparse
36 import re
37 import urllib
38 import urllib2
39 import StringIO
# Python 2 only: make utf-8 the interpreter-wide default encoding used for
# implicit str <-> unicode conversions in the rest of this script.
# NOTE(review): reload(sys) is presumably required because setdefaultencoding()
# is not available on the module as initially imported -- confirm against the
# Python 2 "sys" documentation before touching these two lines.
sys = reload(sys)
sys.setdefaultencoding("utf-8")
class Handler:
    """Convert a darcs repository into a fast-import stream on stdout.

    Each darcs patch is applied, one at a time, to a scratch repository
    (the "working" directory); after every patch the complete tree is
    written out as one fast-import commit.  handle() runs the whole
    conversion; the other methods are its individual steps.
    """

    def __init__(self):
        # patch file names from the hashed inventory, in inventory order
        self.hashes = []
        # darcs author string -> replacement, filled from --authors-file
        self.authormap = {}
        # ":<mark> <hash>" lines, written out by handle_export_marks()
        self.export_marks = []
        # hashes of patches already converted by a previous run
        self.import_marks = []

    def get_patchname(self, patch):
        """Return the commit message of a <patch> element, utf-8 encoded.

        The message is the patch name (prefixed with "UNDO: " for inverted
        patches) followed by the comment lines, minus darcs' "Ignore-this: "
        lines.
        """
        prefix = ""
        if patch.attributes['inverted'].value == 'True':
            prefix = "UNDO: "
        ret = []
        name_nodes = patch.getElementsByTagName("name")[0].childNodes
        if name_nodes.length > 0:
            ret.append(prefix + name_nodes[0].data)
        comments = patch.getElementsByTagName("comment")
        # an empty <comment/> element has no text node to read
        if comments and comments[0].childNodes:
            for line in comments[0].childNodes[0].data.split('\n'):
                if not line.startswith("Ignore-this: "):
                    ret.append(line)
        return "\n".join(ret).encode('utf-8')

    def get_author(self, patch):
        """darcs allows any freeform string, but fast-import has a more
        strict format, so fix up broken author names here.

        Returns the "Name <email>" string, utf-8 encoded."""

        author = patch.attributes['author'].value
        if author in self.authormap:
            author = self.authormap[author]
        if not len(author):
            author = "darcs-fast-export <darcs-fast-export>"
        # add missing name
        elif ">" not in author:
            author = "%s <%s>" % (author.split('@')[0], author)
        # avoid double quoting
        elif author[0] == '"' and author[-1] == '"':
            author = author[1:-1]
        # name after email
        elif author[-1] != '>':
            end = author.index('>')
            author = author[end + 2:] + ' ' + author[:end + 1]
        return author.encode('utf-8')

    def get_date(self, patch):
        """Convert a darcs date string to seconds since the epoch (UTC).

        Despite its name, `patch` is the value of a patch's "date"
        attribute.  Two layouts are understood: "YYYYmmddHHMMSS" and the
        older "Sun Oct 12 15:30:00 CEST 2008" form, whose timezone name is
        dropped before parsing.
        """
        try:
            date = time.strptime(patch, "%Y%m%d%H%M%S")
        except ValueError:
            # keep "Www Mmm dd HH:MM:SS" and " YYYY", skip the zone name
            date = time.strptime(patch[:19] + patch[-5:], '%a %b %d %H:%M:%S %Y')
        return calendar.timegm(date)

    def progress(self, s):
        """Write a timestamped fast-import "progress" line to stdout."""
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        sys.stdout.write("progress [%s] %s\n" % (stamp, s))
        sys.stdout.flush()

    def log(self, s):
        """Append a timestamped entry to the log file opened in handle_opts()."""
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        self.logsock.write("[%s] %s" % (stamp, s))
        self.logsock.flush()

    def parse_inventory(self, sock=None):
        """Collect the patch hashes of a hashed repository into self.hashes.

        Starts at `sock` (default: the repository's hashed_inventory) and
        follows every "Starting with inventory:" reference to the older,
        gzipped inventories.  The hashes of each inventory are put in front
        of the ones already collected.
        """
        if not sock:
            sock = self.open(os.path.join(self.origin, "_darcs", "hashed_inventory"))
        # iterate instead of recursing so long inventory chains cannot hit
        # the recursion limit
        while sock is not None:
            prev = None
            nextprev = False
            found = []
            try:
                for line in sock.readlines():
                    if line.startswith("hash"):
                        # drop the 6-character "hash: " prefix; only strip
                        # the newline if there is one, so an unterminated
                        # last line keeps its final character
                        found.append(line[6:].rstrip('\n'))
                    if line.startswith("Starting with inventory:"):
                        nextprev = True
                    elif nextprev:
                        prev = line.rstrip('\n')
                        nextprev = False
            finally:
                sock.close()
            self.hashes[:0] = found
            sock = None
            if prev:
                sock = self.gzip_open(os.path.join(self.origin, "_darcs", "inventories", prev))

    # this is like gzip.open but supports urls as well
    def gzip_open(self, path):
        """Return a file-like object yielding the decompressed data of `path`."""
        if os.path.exists(path):
            return gzip.open(path)
        remote = urllib.urlopen(path)
        try:
            buf = remote.read()
        finally:
            remote.close()
        return gzip.GzipFile(fileobj=StringIO.StringIO(buf))

    # this is like os.path.exists but supports urls as well
    def path_exists(self, path):
        """Return True if `path` is an existing local path or a fetchable url."""
        if os.path.exists(path):
            return True
        try:
            urllib2.urlopen(urllib2.Request(path)).close()
        except (urllib2.URLError, ValueError):
            # URLError (which includes HTTPError): the url cannot be fetched.
            # ValueError: `path` is not a url at all, i.e. it is a local
            # path that simply does not exist.
            return False
        return True

    # this is like open, but supports urls as well
    def open(self, path):
        """Open a local path or a url for reading."""
        if os.path.exists(path):
            return open(path)
        return urllib.urlopen(path)

    def handle_opts(self):
        """Parse the command line and derive the settings used by the other steps.

        Sets self.options, self.args, self.authormap, self.origin,
        self.working, self.logsock, self.git_branch and self.prognum.
        Exits through optparse's error() on invalid input.
        """
        # Option Parser
        usage = "%prog [options] darcsrepo"
        opp = optparse.OptionParser(usage=usage)
        opp.add_option("--import-marks", metavar="IFILE",
            help="read state for incremental imports from IFILE")
        opp.add_option("--export-marks", metavar="OFILE",
            help="write state for incremental imports to OFILE")
        opp.add_option("--encoding",
            help="encoding of log [default: %default], if unspecified and input isn't utf-8, guess")
        opp.add_option("--authors-file", metavar="F",
            help="read author transformations in old=new format from F")
        opp.add_option("--working", metavar="W",
            help="working directory which is removed at the end of non-incremental conversions")
        opp.add_option("--logfile", metavar="L",
            help="log file which contains the output of external programs invoked during the conversion")
        opp.add_option("--git-branch", metavar="B",
            help="git branch [default: refs/heads/master]")
        opp.add_option("--progress", metavar="P",
            help="insert progress statements after every n commit [default: 100]")
        (self.options, self.args) = opp.parse_args()
        if len(self.args) < 1:
            opp.error("darcsrepo required")

        # read author mapping file in gitauthors format,
        # i. e. in=out (one per # line)
        if self.options.authors_file:
            authormap = {}
            sock = open(self.options.authors_file)
            try:
                for line in sock:
                    line = line.strip()
                    if not line:
                        continue
                    if '=' not in line:
                        opp.error("invalid line in authors file: %r" % line)
                    old, new = line.split('=', 1)
                    authormap[old] = new
            finally:
                sock.close()
            self.authormap = authormap

        remote = "://" in self.args[0]
        if remote:
            self.origin = self.args[0].strip('/')
        else:
            self.origin = os.path.abspath(self.args[0])
        # for a remote origin the default file names are derived from the
        # last url component and therefore end up in the current directory
        if remote:
            default_base = os.path.split(self.origin)[-1]
        else:
            default_base = self.origin
        if self.options.working:
            self.working = os.path.abspath(self.options.working)
        else:
            self.working = "%s.darcs" % default_base
        if self.options.logfile:
            logfile = os.path.abspath(self.options.logfile)
        else:
            logfile = "%s.log" % default_base
        self.logsock = open(logfile, "a")
        if self.options.git_branch:
            self.git_branch = self.options.git_branch
        else:
            self.git_branch = "refs/heads/master"

        if self.options.progress:
            try:
                self.prognum = int(self.options.progress)
            except ValueError:
                opp.error("--progress expects an integer")
            # export_patches() computes count % prognum
            if self.prognum < 1:
                opp.error("--progress must be at least 1")
        else:
            self.prognum = 100

    def handle_import_marks(self):
        """Load the marks file of a previous run, if --import-marks was given.

        Every non-empty line has the form ":<mark> <hash>"; the hash goes to
        self.import_marks and the whole line is carried over to
        self.export_marks.
        """
        if self.options.import_marks:
            sock = open(self.options.import_marks)
            try:
                for raw in sock.readlines():
                    line = raw.strip()
                    if not len(line):
                        continue
                    self.import_marks.append(line.split(' ')[1])
                    self.export_marks.append(line)
            finally:
                sock.close()

    def get_patches(self):
        """Return the <patch> elements of "darcs changes --xml", oldest first.

        On incremental runs the list starts at the last already imported
        patch.  If the output is not valid utf-8 xml, it is decoded using
        --encoding or, failing that, whatever chardet detects.
        """
        self.progress("getting list of patches")
        # an argument list (no shell) keeps repository paths containing
        # spaces or quotes intact
        cmd = ["darcs", "changes", "--xml", "--reverse", "--repo", self.origin]
        if len(self.import_marks):
            cmd.extend(["--from-match", "hash %s" % self.import_marks[-1]])
        buf = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
        # this is hackish. we need to escape some bad chars, otherwise the xml
        # will not be valid
        buf = buf.replace('\x1b', '^[')
        if self.options.encoding:
            xmldoc = xml.dom.minidom.parseString(unicode(buf, self.options.encoding).encode('utf-8'))
        else:
            try:
                xmldoc = xml.dom.minidom.parseString(buf)
            except xml.parsers.expat.ExpatError:
                try:
                    import chardet
                except ImportError:
                    sys.exit("Error, encoding is not utf-8. Please " +
                        "either specify it with the --encoding " +
                        "option or install chardet.")
                self.progress("encoding is not utf8, guessing charset")
                encoding = chardet.detect(buf)['encoding']
                if not encoding:
                    sys.exit("Error, could not guess the encoding. " +
                        "Please specify it with the --encoding option.")
                self.progress("detected encoding is %s" % encoding)
                xmldoc = xml.dom.minidom.parseString(unicode(buf, encoding).encode('utf-8'))
        sys.stdout.flush()
        return xmldoc.getElementsByTagName('patch')

    def setup_workdir(self):
        """Detect the repository format and enter the working repository.

        Reads _darcs/format of the origin, parses the inventory of hashed
        repositories, creates the working repository (or re-uses it on
        incremental runs) and changes the current directory to it.  The
        original directory is remembered in self.cwd.
        """
        darcs2 = False
        self.oldfashionedpatch = True
        self.cwd = os.getcwd()
        format_path = os.path.join(self.origin, "_darcs", "format")
        if self.path_exists(format_path):
            sock = self.open(format_path)
            try:
                formats = [x.strip() for x in sock]
            finally:
                sock.close()
            darcs2 = 'darcs-2' in formats
            self.oldfashionedpatch = 'hashed' not in formats
        if not self.oldfashionedpatch:
            self.progress("parsing the inventory")
            if "://" not in self.origin:
                os.chdir(self.origin)
            self.parse_inventory()
        if not self.options.import_marks or not os.path.exists(self.working):
            # init the tmp darcs repo
            os.mkdir(self.working)
            os.chdir(self.working)
            if darcs2:
                subprocess.call(["darcs", "init", "--darcs-2"])
            else:
                subprocess.call(["darcs", "init", "--hashed"])
        else:
            os.chdir(self.working)
        # bring the working repository up to the last patch converted by the
        # previous run; an empty marks file means there is nothing to pull
        if self.options.import_marks and len(self.import_marks):
            cmd = ["darcs", "pull", "-a", "--match",
                "hash %s" % self.import_marks[-1], self.origin]
            output = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
            self.log("Building/updating working directory:\n%s" % output)

    def sanitize_ref(self, ref):
        """Turn an arbitrary string into something usable as a ref name.

        Runs of control characters, space, ~ ^ : / \\ * ? [ become "_", as
        do leading/trailing runs of "." and "_"; ".." becomes "__", "@{"
        becomes "_{", surrounding underscores are stripped and a trailing
        ".lock" becomes "_lock".  An empty result is returned as "_".
        """
        ref = re.sub(r'[\x00-\x1f\x7f ~^:/\\*?\[]+', '_', ref)
        ref = re.sub(r'^[._]+', '_', ref)
        ref = re.sub(r'[._]+$', '_', ref)
        ref = re.sub(r'[.][.]', '__', ref)
        ref = re.sub(r'@\{', '_{', ref)
        ref = ref.strip('_')
        ref = re.sub(r'[._]+lock$', '_lock', ref)
        if len(ref):
            return ref
        return '_'

    def export_patches(self):
        """Apply every pending patch and write one commit per patch to stdout.

        Must run after setup_workdir(), i.e. with the working repository as
        current directory.  Afterwards the original directory is restored,
        the working repository is removed unless --export-marks was given,
        and the log file is closed.
        """
        out = sys.stdout.write
        patches = self.get_patches()
        # this is the number of the NEXT patch
        count = 1
        if len(self.import_marks):
            # the first entry is the last patch of the previous run
            patches = patches[1:]
            count = len(self.import_marks) + 1
        if len(self.export_marks):
            # this is the mark number of the NEXT patch
            markcount = int(self.export_marks[-1].split(' ')[0][1:]) + 1
        else:
            markcount = count
        # this may be huge and we need it many times
        patchnum = len(patches)

        if not len(self.import_marks):
            self.progress("starting export, repo has %d patches" % patchnum)
        else:
            self.progress("continuing export, %d patches to convert" % patchnum)
        paths = []
        for patch in patches:
            # apply the patch
            patch_hash = patch.attributes['hash'].value
            if not patch_hash.endswith(".gz"):
                patch_hash += ".gz"
            if self.oldfashionedpatch:
                patch_file = patch_hash
            else:
                patch_file = self.hashes[count-1]
            bundle = ["\nNew patches:\n"]
            sock = self.gzip_open(os.path.join(self.origin, "_darcs", "patches", patch_file))
            try:
                bundle.append(sock.read())
            finally:
                sock.close()
            context = subprocess.Popen(["darcs", "changes", "--context"],
                stdout=subprocess.PIPE)
            bundle.append(context.communicate()[0])
            apply_proc = subprocess.Popen(["darcs", "apply", "--allow-conflicts"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            # communicate() feeds stdin and drains stdout concurrently and
            # reaps the child: no pipe deadlock, no zombie per patch
            applied = apply_proc.communicate("".join(bundle))[0]
            self.log("Applying %s:\n%s" % (patch_hash, applied))
            message = self.get_patchname(patch)
            # export the commit
            out("commit %s\n" % self.git_branch)
            out("mark :%s\n" % markcount)
            if self.options.export_marks:
                self.export_marks.append(":%s %s" % (markcount, patch_hash))
            date = self.get_date(patch.attributes['date'].value)
            out("committer %s %s +0000\n" % (self.get_author(patch), date))
            out("data %d\n%s\n" % (len(message), message))
            if markcount > 1:
                out("from :%s\n" % (markcount-1))
            # export the files: delete everything written for the previous
            # commit, then add the complete current tree
            for old in paths:
                out("D %s\n" % old)
            paths = []
            for (root, dirs, files) in os.walk("."):
                # do not descend into the repository metadata at all
                if root == "." and "_darcs" in dirs:
                    dirs.remove("_darcs")
                for f in files:
                    j = os.path.normpath(os.path.join(root, f))
                    if j.startswith("_darcs") or "-darcs-backup" in j:
                        continue
                    paths.append(j)
                    # binary mode: the content must reach the stream unchanged
                    sock = open(j, "rb")
                    try:
                        content = sock.read()
                    finally:
                        sock.close()
                    # darcs does not track the executable bit :/
                    out("M 644 inline %s\n" % j)
                    out("data %s\n%s\n" % (len(content), content))
            if message[:4] == "TAG ":
                tag = self.sanitize_ref(message[4:].strip().split('\n')[0])
                out("tag %s\n" % tag)
                out("from :%s\n" % markcount)
                out("tagger %s %s +0000\n" % (self.get_author(patch), date))
                out("data %d\n%s\n" % (len(message), message))
            if count % self.prognum == 0:
                self.progress("%d/%d patches" % (count, patchnum))
            count += 1
            markcount += 1

        os.chdir(self.cwd)

        if not self.options.export_marks:
            shutil.rmtree(self.working)
        self.logsock.close()

    def handle_export_marks(self):
        """Write the marks file if --export-marks was given, then report completion."""
        if self.options.export_marks:
            self.progress("writing export marks")
            sock = open(self.options.export_marks, 'w')
            try:
                sock.write("\n".join(self.export_marks))
                sock.write("\n")
            finally:
                sock.close()

        self.progress("finished")

    def handle(self):
        """Run the complete conversion."""
        self.handle_opts()
        self.handle_import_marks()
        self.setup_workdir()
        self.export_patches()
        self.handle_export_marks()
if __name__ == "__main__":
    # script entry point: build a converter and run every step in order
    Handler().handle()