darcs-fast-export

   1 #!/usr/bin/env python
   2
   3 """
   4
   5     darcs-fast-export - darcs backend for fast data importers
   6
   7     Copyright (c) 2008, 2009 Miklos Vajna <vmiklos@frugalware.org>
   8     Copyright (c) 2008 Matthias Andree <matthias.andree@gmx.de>
   9
  10     This program is free software; you can redistribute it and/or modify
  11     it under the terms of the GNU General Public License as published by
  12     the Free Software Foundation; either version 2, or (at your option)
  13     any later version.
  14
  15     This program is distributed in the hope that it will be useful,
  16     but WITHOUT ANY WARRANTY; without even the implied warranty of
  17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18     GNU General Public License for more details.
  19
  20     You should have received a copy of the GNU General Public License
  21     along with this program; if not, write to the Free Software
  22     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23
  24 """
  25
  26 import xml.dom.minidom
  27 import xml.parsers.expat
  28 import os
  29 import sys
  30 import gzip
  31 import time
  32 import calendar
  33 import shutil
  34 import subprocess
  35 import optparse
  36 import re
  37 import urllib
  38 import urllib2
  39 import StringIO
  40
  41 sys = reload(sys)
  42 sys.setdefaultencoding("utf-8")
  43
  44 class Handler:
  45         def __init__(self):
  46                 self.hashes = []
  47                 self.authormap = {}
  48                 self.export_marks = []
  49                 self.import_marks = []
  50
  51         def get_patchname(self, patch):
  52                 ret = []
  53                 s = ""
  54                 if patch.attributes['inverted'].value == 'True':
  55                         s = "UNDO: "
  56                 cs = patch.getElementsByTagName("name")[0].childNodes
  57                 if cs.length > 0:
  58                         ret.append(s + cs[0].data)
  59                 lines = patch.getElementsByTagName("comment")
  60                 if lines:
  61                         for i in lines[0].childNodes[0].data.split('\n'):
  62                                 if not i.startswith("Ignore-this: "):
  63                                         ret.append(i)
  64                 return "\n".join(ret).encode('utf-8')
  65
  66         def get_author(self, patch):
  67                 """darcs allows any freeform string, but fast-import has a more
  68                 strict format, so fix up broken author names here."""
  69
  70                 author = patch.attributes['author'].value
  71                 if author in self.authormap:
  72                         author = self.authormap[author]
  73                 if not len(author):
  74                         author = "darcs-fast-export <darcs-fast-export>"
  75                 # add missing name
  76                 elif not ">" in author:
  77                         author = "%s <%s>" % (author.split('@')[0], author)
  78                 # avoid double quoting
  79                 elif author[0] == '"' and author[-1] == '"':
  80                         author = author[1:-1]
  81                 # name after email
  82                 elif author[-1] != '>':
  83                         author = author[author.index('>')+2:] + ' ' + author[:author.index('>')+1]
  84                 return author.encode('utf-8')
  85
  86         def get_date(self, patch):
  87                 try:
  88                         date = time.strptime(patch, "%Y%m%d%H%M%S")
  89                 except ValueError:
  90                         date = time.strptime(patch[:19] + patch[-5:], '%a %b %d %H:%M:%S %Y')
  91                 return calendar.timegm(date)
  92
  93         def progress(self, s):
  94                 print "progress [%s] %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), s)
  95                 sys.stdout.flush()
  96
  97         def log(self, s):
  98                 self.logsock.write("[%s] %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), s))
  99                 self.logsock.flush()
 100
 101         def parse_inventory(self, sock=None):
 102                 prev = None
 103                 nextprev = False
 104                 buf = []
 105                 if not sock:
 106                         sock = self.open(os.path.join(self.origin, "_darcs", "hashed_inventory"))
 107                 for i in sock.readlines():
 108                         if i.startswith("hash"):
 109                                 buf.insert(0, i[6:-1])
 110                         if i.startswith("Starting with inventory:"):
 111                                 nextprev = True
 112                         elif nextprev:
 113                                 prev = i[:-1]
 114                                 nextprev = False
 115                 sock.close()
 116                 for i in buf:
 117                         self.hashes.insert(0, i)
 118                 if prev:
 119                         sock = self.gzip_open(os.path.join(self.origin, "_darcs", "inventories", prev))
 120                         self.parse_inventory(sock)
 121
 122         # this is like gzip.open but supports urls as well
 123         def gzip_open(self, path):
 124                 if os.path.exists(path):
 125                         return gzip.open(path)
 126                 buf = urllib.urlopen(path).read()
 127                 sock = StringIO.StringIO(buf)
 128                 return gzip.GzipFile(fileobj=sock)
 129
 130         # this is like os.path.exists but supports urls as well
 131         def path_exists(self, path):
 132                 if os.path.exists(path):
 133                         return True
 134                 else:
 135                         try:
 136                                 urllib2.urlopen(urllib2.Request(path))
 137                                 return True
 138                         except urllib2.HTTPError, e:
 139                                 return False
 140
 141         # this is like open, but supports urls as well
 142         def open(self, path):
 143                 if os.path.exists(path):
 144                         return open(path)
 145                 else:
 146                         return urllib.urlopen(path)
 147
 148         def handle_opts(self):
 149                 # Option Parser
 150                 usage="%prog [options] darcsrepo"
 151                 opp = optparse.OptionParser(usage=usage)
 152                 opp.add_option("--import-marks", metavar="IFILE",
 153                         help="read state for incremental imports from IFILE")
 154                 opp.add_option("--export-marks", metavar="OFILE",
 155                         help="write state for incremental imports from OFILE")
 156                 opp.add_option("--encoding",
 157                         help="encoding of log [default: %default], if unspecified and input isn't utf-8, guess")
 158                 opp.add_option("--authors-file", metavar="F",
 159                         help="read author transformations in old=new format from F")
 160                 opp.add_option("--working", metavar="W",
 161                         help="working directory which is removed at the end of non-incremental conversions")
 162                 opp.add_option("--logfile", metavar="L",
 163                         help="log file which contains the output of external programs invoked during the conversion")
 164                 opp.add_option("--git-branch", metavar="B",
 165                         help="git branch [default: refs/heads/master]")
 166                 opp.add_option("--progress", metavar="P",
 167                         help="insert progress statements after every n commit [default: 100]")
 168                 (self.options, self.args) = opp.parse_args()
 169                 if len(self.args) < 1:
 170                         opp.error("darcsrepo required")
 171
 172                 # read author mapping file in gitauthors format,
 173                 # i. e. in=out (one per # line)
 174                 if self.options.authors_file:
 175                         sock = open(self.options.authors_file)
 176                         self.authormap = dict([i.strip().split('=',1) for i in sock])
 177                         sock.close()
 178
 179                 if "://" not in self.args[0]:
 180                         self.origin = os.path.abspath(self.args[0])
 181                 else:
 182                         self.origin = self.args[0].strip('/')
 183                 if self.options.working:
 184                         self.working = os.path.abspath(self.options.working)
 185                 else:
 186                         if "://" not in self.origin:
 187                                 self.working = "%s.darcs" % self.origin
 188                         else:
 189                                 self.working = "%s.darcs" % os.path.split(self.origin)[-1]
 190                 if self.options.logfile:
 191                         logfile = os.path.abspath(self.options.logfile)
 192                 else:
 193                         if "://" not in self.origin:
 194                                 logfile = "%s.log" % self.origin
 195                         else:
 196                                 logfile = "%s.log" % os.path.split(self.origin)[-1]
 197                 self.logsock = open(logfile, "a")
 198                 if self.options.git_branch:
 199                         self.git_branch = self.options.git_branch
 200                 else:
 201                         self.git_branch = "refs/heads/master"
 202
 203                 if self.options.progress:
 204                         self.prognum = int(self.options.progress)
 205                 else:
 206                         self.prognum = 100
 207
 208         def handle_import_marks(self):
 209                 if self.options.import_marks:
 210                         sock = open(self.options.import_marks)
 211                         for i in sock.readlines():
 212                                 line = i.strip()
 213                                 if not len(line):
 214                                         continue
 215                                 self.import_marks.append(line.split(' ')[1])
 216                                 self.export_marks.append(line)
 217                         sock.close()
 218
 219         def get_patches(self):
 220                 self.progress("getting list of patches")
 221                 if not len(self.import_marks):
 222                         sock = os.popen("darcs changes --xml --reverse --repo %s" % self.origin)
 223                 else:
 224                         sock = os.popen("darcs changes --xml --reverse  --repo %s --from-match 'hash %s'" % (self.origin, self.import_marks[-1]))
 225                 buf = sock.read()
 226                 sock.close()
 227                 # this is hackish. we need to escape some bad chars, otherwise the xml
 228                 # will not be valid
 229                 buf = buf.replace('\x1b', '^[')
 230                 if self.options.encoding:
 231                         xmldoc = xml.dom.minidom.parseString(unicode(buf, self.options.encoding).encode('utf-8'))
 232                 else:
 233                         try:
 234                                 xmldoc = xml.dom.minidom.parseString(buf)
 235                         except xml.parsers.expat.ExpatError:
 236                                 try:
 237                                         import chardet
 238                                 except ImportError:
 239                                         sys.exit("Error, encoding is not utf-8. Please " +
 240                                                 "either specify it with the --encoding " +
 241                                                 "option or install chardet.")
 242                                 self.progress("encoding is not utf8, guessing charset")
 243                                 encoding = chardet.detect(buf)['encoding']
 244                                 self.progress("detected encoding is %s" % encoding)
 245                                 xmldoc = xml.dom.minidom.parseString(unicode(buf, encoding).encode('utf-8'))
 246                 sys.stdout.flush()
 247                 return xmldoc.getElementsByTagName('patch')
 248
 249         def setup_workdir(self):
 250                 darcs2 = False
 251                 self.oldfashionedpatch = True
 252                 self.cwd = os.getcwd()
 253                 if self.path_exists(os.path.join(self.origin, "_darcs", "format")):
 254                         sock = self.open(os.path.join(self.origin, "_darcs", "format"))
 255                         format = [x.strip() for x in sock]
 256                         sock.close()
 257                         darcs2 = 'darcs-2' in format
 258                         self.oldfashionedpatch = not 'hashed' in format
 259                 if not self.oldfashionedpatch:
 260                         self.progress("parsing the inventory")
 261                         if "://" not in self.origin:
 262                                 os.chdir(self.origin)
 263                         self.parse_inventory()
 264                 if not self.options.import_marks or not os.path.exists(self.working):
 265                         # init the tmp darcs repo
 266                         os.mkdir(self.working)
 267                         os.chdir(self.working)
 268                         if darcs2:
 269                                 os.system("darcs init --darcs-2")
 270                         else:
 271                                 os.system("darcs init --hashed")
 272                 else:
 273                         os.chdir(self.working)
 274                 if self.options.import_marks:
 275                         sock = os.popen("darcs pull -a --match 'hash %s' %s" % (self.import_marks[-1], self.origin))
 276                         self.log("Building/updating working directory:\n%s" % sock.read())
 277                         sock.close()
 278
 279         def sanitize_ref(self, ref):
 280                 ref = re.sub('[\x00-\x1f\x7f ~^:/\\*?[]+', '_', ref)
 281                 ref = re.sub('^[._]+', '_', ref)
 282                 ref = re.sub('[._]+$', '_', ref)
 283                 ref = re.sub('[.][.]', '__', ref)
 284                 ref = re.sub('@\{', '_{', ref)
 285                 ref = ref.strip('_')
 286                 ref = re.sub('[._]+lock$', '_lock', ref)
 287                 if len(ref):
 288                         return ref
 289                 else:
 290                         return '_'
 291
 292         def export_patches(self):
 293                 patches = self.get_patches()
 294                 # this is the number of the NEXT patch
 295                 count = 1
 296                 if len(self.import_marks):
 297                         patches = patches[1:]
 298                         count = len(self.import_marks) + 1
 299                 if len(self.export_marks):
 300                         # this is the mark number of the NEXT patch
 301                         markcount = int(self.export_marks[-1].split(' ')[0][1:]) + 1
 302                 else:
 303                         markcount = count
 304                 # this may be huge and we need it many times
 305                 patchnum = len(patches)
 306
 307                 if not len(self.import_marks):
 308                         self.progress("starting export, repo has %d patches" % patchnum)
 309                 else:
 310                         self.progress("continuing export, %d patches to convert" % patchnum)
 311                 paths = []
 312                 for i in patches:
 313                         # apply the patch
 314                         hash = i.attributes['hash'].value
 315                         if not hash.endswith(".gz"):
 316                                 hash += ".gz"
 317                         buf = ["\nNew patches:\n"]
 318                         if self.oldfashionedpatch:
 319                                 sock = self.gzip_open(os.path.join(self.origin, "_darcs", "patches", hash))
 320                         else:
 321                                 sock = self.gzip_open(os.path.join(self.origin, "_darcs", "patches", self.hashes[count-1]))
 322                         buf.append(sock.read())
 323                         sock.close()
 324                         sock = os.popen("darcs changes --context")
 325                         buf.append(sock.read())
 326                         sock.close()
 327                         sock = subprocess.Popen(["darcs", "apply", "--allow-conflicts"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 328                         sock.stdin.write("".join(buf))
 329                         sock.stdin.close()
 330                         self.log("Applying %s:\n%s" % (hash, sock.stdout.read()))
 331                         sock.stdout.close()
 332                         message = self.get_patchname(i)
 333                         # export the commit
 334                         print "commit %s" % self.git_branch
 335                         print "mark :%s" % markcount
 336                         if self.options.export_marks:
 337                                 self.export_marks.append(":%s %s" % (markcount, hash))
 338                         date = self.get_date(i.attributes['date'].value)
 339                         print "committer %s %s +0000" % (self.get_author(i), date)
 340                         print "data %d\n%s" % (len(message), message)
 341                         if markcount > 1:
 342                                 print "from :%s" % (markcount-1)
 343                         # export the files
 344                         for j in paths:
 345                                 print "D %s" % j
 346                         paths = []
 347                         for (root, dirs, files) in os.walk ("."):
 348                                 for f in files:
 349                                         j = os.path.normpath(os.path.join(root, f))
 350                                         if j.startswith("_darcs") or "-darcs-backup" in j:
 351                                                 continue
 352                                         paths.append(j)
 353                                         sock = open(j)
 354                                         buf = sock.read()
 355                                         sock.close()
 356                                         # darcs does not track the executable bit :/
 357                                         print "M 644 inline %s" % j
 358                                         print "data %s\n%s" % (len(buf), buf)
 359                         if message[:4] == "TAG ":
 360                                 tag = self.sanitize_ref(message[4:].strip().split('\n')[0])
 361                                 print "tag %s" % tag
 362                                 print "from :%s" % markcount
 363                                 print "tagger %s %s +0000" % (self.get_author(i), date)
 364                                 print "data %d\n%s" % (len(message), message)
 365                         if count % self.prognum == 0:
 366                                 self.progress("%d/%d patches" % (count, patchnum))
 367                         count += 1
 368                         markcount += 1
 369
 370                 os.chdir(self.cwd)
 371
 372                 if not self.options.export_marks:
 373                         shutil.rmtree(self.working)
 374                 self.logsock.close()
 375
 376         def handle_export_marks(self):
 377                 if self.options.export_marks:
 378                         self.progress("writing export marks")
 379                         sock = open(self.options.export_marks, 'w')
 380                         sock.write("\n".join(self.export_marks))
 381                         sock.write("\n")
 382                         sock.close()
 383
 384                 self.progress("finished")
 385
 386         def handle(self):
 387                 self.handle_opts()
 388                 self.handle_import_marks()
 389                 self.setup_workdir()
 390                 self.export_patches()
 391                 self.handle_export_marks()
 392
 393 if __name__ == "__main__":
 394         h = Handler()
 395         h.handle()