parser.py

   1 # Copyright (C) 2008 Canonical Ltd
   2 #
   3 # This program is free software; you can redistribute it and/or modify
   4 # it under the terms of the GNU General Public License as published by
   5 # the Free Software Foundation; either version 2 of the License, or
   6 # (at your option) any later version.
   7 #
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12 #
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  16
  17 """Parser of import data into command objects.
  18
  19 In order to reuse existing front-ends, the stream format is a subset of
  20 the one used by git-fast-import (as of the 1.5.4 release of git at least).
  21 The grammar is:
  22
  23   stream ::= cmd*;
  24
  25   cmd ::= new_blob
  26         | new_commit
  27         | new_tag
  28         | reset_branch
  29         | checkpoint
  30         | progress
  31         ;
  32
  33   new_blob ::= 'blob' lf
  34     mark?
  35     file_content;
  36   file_content ::= data;
  37
  38   new_commit ::= 'commit' sp ref_str lf
  39     mark?
  40     ('author' sp name '<' email '>' when lf)?
  41     'committer' sp name '<' email '>' when lf
  42     commit_msg
  43     ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
  44     ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
  45     file_change*
  46     lf?;
  47   commit_msg ::= data;
  48
  49   file_change ::= file_clr
  50     | file_del
  51     | file_rnm
  52     | file_cpy
  53     | file_obm
  54     | file_inm;
  55   file_clr ::= 'deleteall' lf;
  56   file_del ::= 'D' sp path_str lf;
  57   file_rnm ::= 'R' sp path_str sp path_str lf;
  58   file_cpy ::= 'C' sp path_str sp path_str lf;
  59   file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
  60   file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
  61     data;
  62
  63   new_tag ::= 'tag' sp tag_str lf
  64     'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
  65     'tagger' sp name '<' email '>' when lf
  66     tag_msg;
  67   tag_msg ::= data;
  68
  69   reset_branch ::= 'reset' sp ref_str lf
  70     ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
  71     lf?;
  72
  73   checkpoint ::= 'checkpoint' lf
  74     lf?;
  75
  76   progress ::= 'progress' sp not_lf* lf
  77     lf?;
  78
  79      # note: the first idnum in a stream should be 1 and subsequent
  80      # idnums should not have gaps between values as this will cause
  81      # the stream parser to reserve space for the gapped values.  An
  82      # idnum can be updated in the future to a new object by issuing
  83      # a new mark directive with the old idnum.
  84      #
  85   mark ::= 'mark' sp idnum lf;
  86   data ::= (delimited_data | exact_data)
  87     lf?;
  88
  89     # note: delim may be any string but must not contain lf.
  90     # data_line may contain any data but must not be exactly
  91     # delim. The lf after the final data_line is included in
  92     # the data.
  93   delimited_data ::= 'data' sp '<<' delim lf
  94     (data_line lf)*
  95     delim lf;
  96
  97      # note: declen indicates the length of binary_data in bytes.
   98      # declen does not include the lf preceding the binary data.
  99      #
 100   exact_data ::= 'data' sp declen lf
 101     binary_data;
 102
 103      # note: quoted strings are C-style quoting supporting \c for
  104      # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
 105      # is the signed byte value in octal.  Note that the only
 106      # characters which must actually be escaped to protect the
 107      # stream formatting is: \, " and LF.  Otherwise these values
 108      # are UTF8.
 109      #
 110   ref_str     ::= ref;
 111   sha1exp_str ::= sha1exp;
 112   tag_str     ::= tag;
 113   path_str    ::= path    | '"' quoted(path)    '"' ;
 114   mode        ::= '100644' | '644'
 115                 | '100755' | '755'
 116                 | '120000'
 117                 ;
 118
 119   declen ::= # unsigned 32 bit value, ascii base10 notation;
 120   bigint ::= # unsigned integer value, ascii base10 notation;
 121   binary_data ::= # file content, not interpreted;
 122
 123   when         ::= raw_when | rfc2822_when;
 124   raw_when     ::= ts sp tz;
 125   rfc2822_when ::= # Valid RFC 2822 date and time;
 126
 127   sp ::= # ASCII space character;
 128   lf ::= # ASCII newline (LF) character;
 129
 130      # note: a colon (':') must precede the numerical value assigned to
 131      # an idnum.  This is to distinguish it from a ref or tag name as
 132      # GIT does not permit ':' in ref or tag strings.
 133      #
 134   idnum   ::= ':' bigint;
 135   path    ::= # GIT style file path, e.g. "a/b/c";
 136   ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
 137   tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
 138   sha1exp ::= # Any valid GIT SHA1 expression;
 139   hexsha1 ::= # SHA1 in hexadecimal format;
 140
 141      # note: name and email are UTF8 strings, however name must not
 142      # contain '<' or lf and email must not contain any of the
 143      # following: '<', '>', lf.
 144      #
 145   name  ::= # valid GIT author/committer name;
 146   email ::= # valid GIT author/committer email;
 147   ts    ::= # time since the epoch in seconds, ascii base10 notation;
 148   tz    ::= # GIT style timezone;
 149
 150      # note: comments may appear anywhere in the input, except
 151      # within a data command.  Any form of the data command
 152      # always escapes the related input from comment processing.
 153      #
 154      # In case it is not clear, the '#' that starts the comment
  155      # must be the first character on the line (an lf must have
  156      # preceded it).
 157      #
 158   comment ::= '#' not_lf* lf;
 159   not_lf  ::= # Any byte that is not ASCII newline (LF);
 160 """
 161
 162
 163 import re
 164 import sys
 165
 166 import commands
 167 import dates
 168 import errors
 169
 170
 171 ## Stream parsing ##
 172
 173 class LineBasedParser(object):
 174
 175     def __init__(self, input):
 176         """A Parser that keeps track of line numbers.
 177
 178         :param input: the file-like object to read from
 179         """
 180         self.input = input
 181         self.lineno = 0
 182         # Lines pushed back onto the input stream
 183         self._buffer = []
 184
 185     def abort(self, exception, *args):
 186         """Raise an exception providing line number information."""
 187         raise exception(self.lineno, *args)
 188
 189     def readline(self):
 190         """Get the next line including the newline or '' on EOF."""
 191         self.lineno += 1
 192         if self._buffer:
 193             return self._buffer.pop()
 194         else:
 195             return self.input.readline()
 196
 197     def next_line(self):
 198         """Get the next line without the newline or None on EOF."""
 199         line = self.readline()
 200         if line:
 201             return line[:-1]
 202         else:
 203             return None
 204
 205     def push_line(self, line):
 206         """Push line back onto the line buffer.
 207
 208         :param line: the line with no trailing newline
 209         """
 210         self.lineno -= 1
 211         self._buffer.append(line + "\n")
 212
 213     def read_bytes(self, count):
 214         """Read a given number of bytes from the input stream.
 215
 216         Throws MissingBytes if the bytes are not found.
 217
 218         Note: This method does not read from the line buffer.
 219
 220         :return: a string
 221         """
 222         result = self.input.read(count)
 223         found = len(result)
 224         self.lineno += result.count("\n")
 225         if found != count:
 226             self.abort(errors.MissingBytes, count, found)
 227         return result
 228
 229     def read_until(self, terminator):
 230         """Read the input stream until the terminator is found.
 231
 232         Throws MissingTerminator if the terminator is not found.
 233
 234         Note: This method does not read from the line buffer.
 235
 236         :return: the bytes read up to but excluding the terminator.
 237         """
 238
 239         lines = []
 240         term = terminator + '\n'
 241         while True:
 242             line = self.input.readline()
 243             if line == term:
 244                 break
 245             else:
 246                 lines.append(line)
 247         return ''.join(lines)
 248
 249
  250 # Regular expressions used for parsing. (Note: The spec states that the name
  251 # part should be non-empty but git-fast-export doesn't always do that so
  252 # the first bit is [^<]*, not [^<]+.) Also git-fast-import code says the
  253 # space before the email is optional.
 254 _WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
 255 _WHO_RE = re.compile(r'([^<]*)<(.*)>')
 256
 257
 258 class ImportParser(LineBasedParser):
 259
 260     def __init__(self, input, verbose=False, output=sys.stdout,
 261         user_mapper=None):
 262         """A Parser of import commands.
 263
 264         :param input: the file-like object to read from
 265         :param verbose: display extra information of not
 266         :param output: the file-like object to write messages to (YAGNI?)
 267         :param user_mapper: if not None, the UserMapper used to adjust
 268           user-ids for authors, committers and taggers.
 269         """
 270         LineBasedParser.__init__(self, input)
 271         self.verbose = verbose
 272         self.output = output
 273         self.user_mapper = user_mapper
 274         # We auto-detect the date format when a date is first encountered
 275         self.date_parser = None
 276
 277     def warning(self, msg):
 278         sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))
 279
 280     def iter_commands(self):
 281         """Iterator returning ImportCommand objects."""
 282         while True:
 283             line = self.next_line()
 284             if line is None:
 285                 break
 286             elif len(line) == 0 or line.startswith('#'):
 287                 continue
 288             # Search for commands in order of likelihood
 289             elif line.startswith('commit '):
 290                 yield self._parse_commit(line[len('commit '):])
 291             elif line.startswith('blob'):
 292                 yield self._parse_blob()
 293             elif line.startswith('progress '):
 294                 yield commands.ProgressCommand(line[len('progress '):])
 295             elif line.startswith('reset '):
 296                 yield self._parse_reset(line[len('reset '):])
 297             elif line.startswith('tag '):
 298                 yield self._parse_tag(line[len('tag '):])
 299             elif line.startswith('checkpoint'):
 300                 yield commands.CheckpointCommand()
 301             elif line.startswith('feature'):
 302                 yield self._parse_feature(line[len('feature '):])
 303             else:
 304                 self.abort(errors.InvalidCommand, line)
 305
 306     def iter_file_commands(self):
 307         """Iterator returning FileCommand objects.
 308
 309         If an invalid file command is found, the line is silently
 310         pushed back and iteration ends.
 311         """
 312         while True:
 313             line = self.next_line()
 314             if line is None:
 315                 break
 316             elif len(line) == 0 or line.startswith('#'):
 317                 continue
 318             # Search for file commands in order of likelihood
 319             elif line.startswith('M '):
 320                 yield self._parse_file_modify(line[2:])
 321             elif line.startswith('D '):
 322                 path = self._path(line[2:])
 323                 yield commands.FileDeleteCommand(path)
 324             elif line.startswith('R '):
 325                 old, new = self._path_pair(line[2:])
 326                 yield commands.FileRenameCommand(old, new)
 327             elif line.startswith('C '):
 328                 src, dest = self._path_pair(line[2:])
 329                 yield commands.FileCopyCommand(src, dest)
 330             elif line.startswith('deleteall'):
 331                 yield commands.FileDeleteAllCommand()
 332             else:
 333                 self.push_line(line)
 334                 break
 335
 336     def _parse_blob(self):
 337         """Parse a blob command."""
 338         lineno = self.lineno
 339         mark = self._get_mark_if_any()
 340         data = self._get_data('blob')
 341         return commands.BlobCommand(mark, data, lineno)
 342
 343     def _parse_commit(self, ref):
 344         """Parse a commit command."""
 345         lineno  = self.lineno
 346         mark = self._get_mark_if_any()
 347         author = self._get_user_info('commit', 'author', False)
 348         more_authors = []
 349         while True:
 350             another_author = self._get_user_info('commit', 'author', False)
 351             if another_author is not None:
 352                 more_authors.append(another_author)
 353             else:
 354                 break
 355         committer = self._get_user_info('commit', 'committer')
 356         message = self._get_data('commit', 'message')
 357         try:
 358             message = message.decode('utf_8')
 359         except UnicodeDecodeError:
 360             self.warning(
 361                 "commit message not in utf8 - replacing unknown characters")
 362             message = message.decode('utf_8', 'replace')
 363         from_ = self._get_from()
 364         merges = []
 365         while True:
 366             merge = self._get_merge()
 367             if merge is not None:
 368                 # while the spec suggests it's illegal, git-fast-export
 369                 # outputs multiple merges on the one line, e.g.
 370                 # merge :x :y :z
 371                 these_merges = merge.split(" ")
 372                 merges.extend(these_merges)
 373             else:
 374                 break
 375         properties = {}
 376         while True:
 377             name_value = self._get_property()
 378             if name_value is not None:
 379                 name, value = name_value
 380                 properties[name] = value
 381             else:
 382                 break
 383         return commands.CommitCommand(ref, mark, author, committer, message,
 384             from_, merges, self.iter_file_commands, lineno=lineno,
 385             more_authors=more_authors, properties=properties)
 386
 387     def _parse_feature(self, info):
 388         """Parse a feature command."""
 389         parts = info.split("=", 1)
 390         name = parts[0]
 391         if len(parts) > 1:
 392             value = self._path(parts[1])
 393         else:
 394             value = None
 395         return commands.FeatureCommand(name, value, lineno=self.lineno)
 396
 397     def _parse_file_modify(self, info):
 398         """Parse a filemodify command within a commit.
 399
 400         :param info: a string in the format "mode dataref path"
 401           (where dataref might be the hard-coded literal 'inline').
 402         """
 403         params = info.split(' ', 2)
 404         path = self._path(params[2])
 405         is_executable, kind = self._mode(params[0])
 406         if params[1] == 'inline':
 407             dataref = None
 408             data = self._get_data('filemodify')
 409         else:
 410             dataref = params[1]
 411             data = None
 412         return commands.FileModifyCommand(path, kind, is_executable, dataref,
 413             data)
 414
 415     def _parse_reset(self, ref):
 416         """Parse a reset command."""
 417         from_ = self._get_from()
 418         return commands.ResetCommand(ref, from_)
 419
 420     def _parse_tag(self, name):
 421         """Parse a tag command."""
 422         from_ = self._get_from('tag')
 423         tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
 424         message = self._get_data('tag', 'message').decode('utf_8')
 425         return commands.TagCommand(name, from_, tagger, message)
 426
 427     def _get_mark_if_any(self):
 428         """Parse a mark section."""
 429         line = self.next_line()
 430         if line.startswith('mark :'):
 431             return line[len('mark :'):]
 432         else:
 433             self.push_line(line)
 434             return None
 435
 436     def _get_from(self, required_for=None):
 437         """Parse a from section."""
 438         line = self.next_line()
 439         if line is None:
 440             return None
 441         elif line.startswith('from '):
 442             return line[len('from '):]
 443         elif required_for:
 444             self.abort(errors.MissingSection, required_for, 'from')
 445         else:
 446             self.push_line(line)
 447             return None
 448
 449     def _get_merge(self):
 450         """Parse a merge section."""
 451         line = self.next_line()
 452         if line is None:
 453             return None
 454         elif line.startswith('merge '):
 455             return line[len('merge '):]
 456         else:
 457             self.push_line(line)
 458             return None
 459
 460     def _get_property(self):
 461         """Parse a property section."""
 462         line = self.next_line()
 463         if line is None:
 464             return None
 465         elif line.startswith('property '):
 466             return self._name_value(line[len('property '):])
 467         else:
 468             self.push_line(line)
 469             return None
 470
 471     def _get_user_info(self, cmd, section, required=True,
 472         accept_just_who=False):
 473         """Parse a user section."""
 474         line = self.next_line()
 475         if line.startswith(section + ' '):
 476             return self._who_when(line[len(section + ' '):], cmd, section,
 477                 accept_just_who=accept_just_who)
 478         elif required:
 479             self.abort(errors.MissingSection, cmd, section)
 480         else:
 481             self.push_line(line)
 482             return None
 483
 484     def _get_data(self, required_for, section='data'):
 485         """Parse a data section."""
 486         line = self.next_line()
 487         if line.startswith('data '):
 488             rest = line[len('data '):]
 489             if rest.startswith('<<'):
 490                 return self.read_until(rest[2:])
 491             else:
 492                 size = int(rest)
 493                 read_bytes = self.read_bytes(size)
 494                 # optional LF after data.
 495                 next = self.input.readline()
 496                 self.lineno += 1
 497                 if len(next) > 1 or next != "\n":
 498                     self.push_line(next[:-1])
 499                 return read_bytes
 500         else:
 501             self.abort(errors.MissingSection, required_for, section)
 502
 503     def _who_when(self, s, cmd, section, accept_just_who=False):
 504         """Parse who and when information from a string.
 505
 506         :return: a tuple of (name,email,timestamp,timezone). name may be
 507             the empty string if only an email address was given.
 508         """
 509         match = _WHO_AND_WHEN_RE.search(s)
 510         if match:
 511             datestr = match.group(3).lstrip()
 512             if self.date_parser is None:
 513                 # auto-detect the date format
 514                 if len(datestr.split(' ')) == 2:
 515                     format = 'raw'
 516                 elif datestr == 'now':
 517                     format = 'now'
 518                 else:
 519                     format = 'rfc2822'
 520                 self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
 521             try:
 522                 when = self.date_parser(datestr, self.lineno)
 523             except ValueError:
 524                 print "failed to parse datestr '%s'" % (datestr,)
 525                 raise
 526         else:
 527             match = _WHO_RE.search(s)
 528             if accept_just_who and match:
 529                 # HACK around missing time
 530                 # TODO: output a warning here
 531                 when = dates.DATE_PARSERS_BY_NAME['now']('now')
 532             else:
 533                 self.abort(errors.BadFormat, cmd, section, s)
 534         name = match.group(1)
 535         if len(name) > 0:
 536             if name[-1] == " ":
 537                 try:
 538                     name = name[:-1].decode('utf_8')
 539                 except UnicodeDecodeError:
 540                     # The spec says names are *typically* utf8 encoded
 541                     # but that isn't enforced by git-fast-export (at least)
 542                     self.warning("%s name not in utf8 - replacing unknown "
 543                         "characters" % (section,))
 544                     name = name[:-1].decode('utf_8', 'replace')
 545         email = match.group(2)
 546         # While it shouldn't happen, some datasets have email addresses
 547         # which contain unicode characters. See bug 338186. We sanitize
 548         # the data at this level just in case.
 549         try:
 550             email = email.decode('utf_8')
 551         except UnicodeDecodeError:
 552             self.warning("%s email not in utf8 - replacing unknown characters"
 553                 % (section,))
 554             email = email.decode('utf_8', 'replace')
 555         if self.user_mapper:
 556             name, email = self.user_mapper.map_name_and_email(name, email)
 557         return (name, email, when[0], when[1])
 558
 559     def _name_value(self, s):
 560         """Parse a (name,value) tuple from 'name value-length value'."""
 561         parts = s.split(' ', 2)
 562         name = parts[0]
 563         if len(parts) == 1:
 564             value = None
 565         else:
 566             size = int(parts[1])
 567             value = parts[2]
 568             still_to_read = size - len(value)
 569             if still_to_read > 0:
 570                 read_bytes = self.read_bytes(still_to_read)
 571                 value += "\n" + read_bytes[:still_to_read - 1]
 572             value = value.decode('utf8')
 573         return (name, value)
 574
 575     def _path(self, s):
 576         """Parse a path."""
 577         if s.startswith('"'):
 578             if s[-1] != '"':
 579                 self.abort(errors.BadFormat, '?', '?', s)
 580             else:
 581                 return _unquote_c_string(s[1:-1])
 582         try:
 583             return s.decode('utf_8')
 584         except UnicodeDecodeError:
 585             # The spec recommends utf8 encoding but that isn't enforced
 586             return s
 587
 588     def _path_pair(self, s):
 589         """Parse two paths separated by a space."""
 590         # TODO: handle a space in the first path
 591         if s.startswith('"'):
 592             parts = s[1:].split('" ', 1)
 593         else:
 594             parts = s.split(' ', 1)
 595         if len(parts) != 2:
 596             self.abort(errors.BadFormat, '?', '?', s)
 597         elif parts[1].startswith('"') and parts[1].endswith('"'):
 598             parts[1] = parts[1][1:-1]
 599         elif parts[1].startswith('"') or parts[1].endswith('"'):
 600             self.abort(errors.BadFormat, '?', '?', s)
 601         return map(_unquote_c_string, parts)
 602
 603     def _mode(self, s):
 604         """Parse a file mode into executable and kind.
 605
 606         :return (is_executable, kind)
 607         """
 608         # Note: Output from git-fast-export slightly different to spec
 609         if s in ['644', '100644', '0100644']:
 610             return False, commands.FILE_KIND
 611         elif s in ['755', '100755', '0100755']:
 612             return True, commands.FILE_KIND
 613         elif s in ['040000', '0040000']:
 614             return False, commands.DIRECTORY_KIND
 615         elif s in ['120000', '0120000']:
 616             return False, commands.SYMLINK_KIND
 617         elif s in ['160000', '0160000']:
 618             return False, commands.TREE_REFERENCE_KIND
 619         else:
 620             self.abort(errors.BadFormat, 'filemodify', 'mode', s)
 621
 622
 623 def _unquote_c_string(s):
 624     """replace C-style escape sequences (\n, \", etc.) with real chars."""
 625     # HACK: Python strings are close enough
 626     return s.decode('string_escape', 'replace')