1 # Copyright (C) 2008 Canonical Ltd
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 """Parser of import data into command objects.
19 In order to reuse existing front-ends, the stream format is a subset of
20 the one used by git-fast-import (as of the 1.5.4 release of git at least).
33 new_blob ::= 'blob' lf
36 file_content ::= data;
38 new_commit ::= 'commit' sp ref_str lf
40 ('author' sp name '<' email '>' when lf)?
41 'committer' sp name '<' email '>' when lf
43 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44 ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
49 file_change ::= file_clr
55 file_clr ::= 'deleteall' lf;
56 file_del ::= 'D' sp path_str lf;
57 file_rnm ::= 'R' sp path_str sp path_str lf;
58 file_cpy ::= 'C' sp path_str sp path_str lf;
59 file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60 file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
63 new_tag ::= 'tag' sp tag_str lf
64 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65 'tagger' sp name '<' email '>' when lf
69 reset_branch ::= 'reset' sp ref_str lf
70 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
73 checkpoint ::= 'checkpoint' lf
76 progress ::= 'progress' sp not_lf* lf
79 # note: the first idnum in a stream should be 1 and subsequent
80 # idnums should not have gaps between values as this will cause
81 # the stream parser to reserve space for the gapped values. An
82 # idnum can be updated in the future to a new object by issuing
83 # a new mark directive with the old idnum.
85 mark ::= 'mark' sp idnum lf;
86 data ::= (delimited_data | exact_data)
89 # note: delim may be any string but must not contain lf.
90 # data_line may contain any data but must not be exactly
91 # delim. The lf after the final data_line is included in
93 delimited_data ::= 'data' sp '<<' delim lf
97 # note: declen indicates the length of binary_data in bytes.
98 # declen does not include the lf preceding the binary data.
100 exact_data ::= 'data' sp declen lf
103 # note: quoted strings are C-style quoting supporting \c for
104 # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
105 # is the signed byte value in octal. Note that the only
106 # characters which must actually be escaped to protect the
107 # stream formatting is: \, " and LF. Otherwise these values
111 sha1exp_str ::= sha1exp;
113 path_str ::= path | '"' quoted(path) '"' ;
114 mode ::= '100644' | '644'
119 declen ::= # unsigned 32 bit value, ascii base10 notation;
120 bigint ::= # unsigned integer value, ascii base10 notation;
121 binary_data ::= # file content, not interpreted;
123 when ::= raw_when | rfc2822_when;
124 raw_when ::= ts sp tz;
125 rfc2822_when ::= # Valid RFC 2822 date and time;
127 sp ::= # ASCII space character;
128 lf ::= # ASCII newline (LF) character;
130 # note: a colon (':') must precede the numerical value assigned to
131 # an idnum. This is to distinguish it from a ref or tag name as
132 # GIT does not permit ':' in ref or tag strings.
134 idnum ::= ':' bigint;
135 path ::= # GIT style file path, e.g. "a/b/c";
136 ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
137 tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
138 sha1exp ::= # Any valid GIT SHA1 expression;
139 hexsha1 ::= # SHA1 in hexadecimal format;
141 # note: name and email are UTF8 strings, however name must not
142 # contain '<' or lf and email must not contain any of the
143 # following: '<', '>', lf.
145 name ::= # valid GIT author/committer name;
146 email ::= # valid GIT author/committer email;
147 ts ::= # time since the epoch in seconds, ascii base10 notation;
148 tz ::= # GIT style timezone;
150 # note: comments may appear anywhere in the input, except
151 # within a data command. Any form of the data command
152 # always escapes the related input from comment processing.
154 # In case it is not clear, the '#' that starts the comment
155 # must be the first character on that line (an lf must have
158 comment ::= '#' not_lf* lf;
159 not_lf ::= # Any byte that is not ASCII newline (LF);
173 class LineBasedParser(object):
175 def __init__(self
, input):
176 """A Parser that keeps track of line numbers.
178 :param input: the file-like object to read from
182 # Lines pushed back onto the input stream
def abort(self, exception, *args):
    """Raise ``exception`` tagged with the parser's current line number.

    :param exception: a callable (normally an exception class); it is
        invoked as ``exception(self.lineno, *args)`` and the result raised
    :param args: extra positional arguments passed after the line number
    """
    failure = exception(self.lineno, *args)
    raise failure
190 """Get the next line including the newline or '' on EOF."""
193 return self
._buffer
.pop()
195 return self
.input.readline()
198 """Get the next line without the newline or None on EOF."""
199 line
= self
.readline()
205 def push_line(self
, line
):
206 """Push line back onto the line buffer.
208 :param line: the line with no trailing newline
211 self
._buffer
.append(line
+ "\n")
213 def read_bytes(self
, count
):
214 """Read a given number of bytes from the input stream.
216 Throws MissingBytes if the bytes are not found.
218 Note: This method does not read from the line buffer.
222 result
= self
.input.read(count
)
224 self
.lineno
+= result
.count("\n")
226 self
.abort(errors
.MissingBytes
, count
, found
)
229 def read_until(self
, terminator
):
230 """Read the input stream until the terminator is found.
232 Throws MissingTerminator if the terminator is not found.
234 Note: This method does not read from the line buffer.
236 :return: the bytes read up to but excluding the terminator.
240 term
= terminator
+ '\n'
242 line
= self
.input.readline()
247 return ''.join(lines
)
# Regular expressions used for parsing user sections of the form
# "name <email> when". Group 1 is the name, group 2 the email and, for
# _WHO_AND_WHEN_RE only, group 3 the date text.
# (Note: The spec states that the name part should be non-empty but
# git-fast-export doesn't always do that, so the name group is [^<]*
# (zero or more characters) rather than a one-or-more form.) Also, the
# git-fast-import code says the space before the email is optional, so
# the name group keeps any trailing space that is present.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
# Fallback for sections that carry only "name <email>" with no date.
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
258 class ImportParser(LineBasedParser
):
260 def __init__(self
, input, verbose
=False, output
=sys
.stdout
,
262 """A Parser of import commands.
264 :param input: the file-like object to read from
265 :param verbose: display extra information or not
266 :param output: the file-like object to write messages to (YAGNI?)
267 :param user_mapper: if not None, the UserMapper used to adjust
268 user-ids for authors, committers and taggers.
270 LineBasedParser
.__init
__(self
, input)
271 self
.verbose
= verbose
273 self
.user_mapper
= user_mapper
274 # We auto-detect the date format when a date is first encountered
275 self
.date_parser
= None
def warning(self, msg):
    """Write a warning, prefixed with the current line number, to stderr.

    :param msg: the warning text; a trailing newline is appended here
    """
    text = "warning line %d: %s\n" % (self.lineno, msg)
    sys.stderr.write(text)
280 def iter_commands(self
):
281 """Iterator returning ImportCommand objects."""
283 line
= self
.next_line()
286 elif len(line
) == 0 or line
.startswith('#'):
288 # Search for commands in order of likelihood
289 elif line
.startswith('commit '):
290 yield self
._parse
_commit
(line
[len('commit '):])
291 elif line
.startswith('blob'):
292 yield self
._parse
_blob
()
293 elif line
.startswith('progress '):
294 yield commands
.ProgressCommand(line
[len('progress '):])
295 elif line
.startswith('reset '):
296 yield self
._parse
_reset
(line
[len('reset '):])
297 elif line
.startswith('tag '):
298 yield self
._parse
_tag
(line
[len('tag '):])
299 elif line
.startswith('checkpoint'):
300 yield commands
.CheckpointCommand()
301 elif line
.startswith('feature'):
302 yield self
._parse
_feature
(line
[len('feature '):])
304 self
.abort(errors
.InvalidCommand
, line
)
306 def iter_file_commands(self
):
307 """Iterator returning FileCommand objects.
309 If an invalid file command is found, the line is silently
310 pushed back and iteration ends.
313 line
= self
.next_line()
316 elif len(line
) == 0 or line
.startswith('#'):
318 # Search for file commands in order of likelihood
319 elif line
.startswith('M '):
320 yield self
._parse
_file
_modify
(line
[2:])
321 elif line
.startswith('D '):
322 path
= self
._path
(line
[2:])
323 yield commands
.FileDeleteCommand(path
)
324 elif line
.startswith('R '):
325 old
, new
= self
._path
_pair
(line
[2:])
326 yield commands
.FileRenameCommand(old
, new
)
327 elif line
.startswith('C '):
328 src
, dest
= self
._path
_pair
(line
[2:])
329 yield commands
.FileCopyCommand(src
, dest
)
330 elif line
.startswith('deleteall'):
331 yield commands
.FileDeleteAllCommand()
336 def _parse_blob(self
):
337 """Parse a blob command."""
339 mark
= self
._get
_mark
_if
_any
()
340 data
= self
._get
_data
('blob')
341 return commands
.BlobCommand(mark
, data
, lineno
)
343 def _parse_commit(self
, ref
):
344 """Parse a commit command."""
346 mark
= self
._get
_mark
_if
_any
()
347 author
= self
._get
_user
_info
('commit', 'author', False)
350 another_author
= self
._get
_user
_info
('commit', 'author', False)
351 if another_author
is not None:
352 more_authors
.append(another_author
)
355 committer
= self
._get
_user
_info
('commit', 'committer')
356 message
= self
._get
_data
('commit', 'message')
358 message
= message
.decode('utf_8')
359 except UnicodeDecodeError:
361 "commit message not in utf8 - replacing unknown characters")
362 message
= message
.decode('utf_8', 'replace')
363 from_
= self
._get
_from
()
366 merge
= self
._get
_merge
()
367 if merge
is not None:
368 # while the spec suggests it's illegal, git-fast-export
369 # outputs multiple merges on the one line, e.g.
371 these_merges
= merge
.split(" ")
372 merges
.extend(these_merges
)
377 name_value
= self
._get
_property
()
378 if name_value
is not None:
379 name
, value
= name_value
380 properties
[name
] = value
383 return commands
.CommitCommand(ref
, mark
, author
, committer
, message
,
384 from_
, merges
, self
.iter_file_commands
, lineno
=lineno
,
385 more_authors
=more_authors
, properties
=properties
)
387 def _parse_feature(self
, info
):
388 """Parse a feature command."""
389 parts
= info
.split("=", 1)
392 value
= self
._path
(parts
[1])
395 return commands
.FeatureCommand(name
, value
, lineno
=self
.lineno
)
397 def _parse_file_modify(self
, info
):
398 """Parse a filemodify command within a commit.
400 :param info: a string in the format "mode dataref path"
401 (where dataref might be the hard-coded literal 'inline').
403 params
= info
.split(' ', 2)
404 path
= self
._path
(params
[2])
405 is_executable
, kind
= self
._mode
(params
[0])
406 if params
[1] == 'inline':
408 data
= self
._get
_data
('filemodify')
412 return commands
.FileModifyCommand(path
, kind
, is_executable
, dataref
,
def _parse_reset(self, ref):
    """Build the command object for a reset command.

    :param ref: the text that followed 'reset ' on the command line
    :return: ``commands.ResetCommand(ref, from_)``, where ``from_`` is
        whatever ``self._get_from()`` yields when called without a
        ``required_for`` value
    """
    start_point = self._get_from()
    reset_cmd = commands.ResetCommand(ref, start_point)
    return reset_cmd
def _parse_tag(self, name):
    """Parse a tag command.

    Reads the 'from' section, the 'tagger' section (a tagger given
    without a date is accepted) and the message data that follow the
    tag line.

    :param name: the text that followed 'tag ' on the command line
    :return: ``commands.TagCommand(name, from_, tagger, message)`` with
        the message decoded from UTF-8
    """
    from_ = self._get_from('tag')
    tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
    message = self._get_data('tag', 'message')
    try:
        message = message.decode('utf_8')
    except UnicodeDecodeError:
        # Recover the same way commit messages, names and emails do:
        # warn and substitute the undecodable bytes rather than
        # aborting the whole import on one badly encoded tag message.
        self.warning(
            "tag message not in utf8 - replacing unknown characters")
        message = message.decode('utf_8', 'replace')
    return commands.TagCommand(name, from_, tagger, message)
427 def _get_mark_if_any(self
):
428 """Parse a mark section."""
429 line
= self
.next_line()
430 if line
.startswith('mark :'):
431 return line
[len('mark :'):]
436 def _get_from(self
, required_for
=None):
437 """Parse a from section."""
438 line
= self
.next_line()
441 elif line
.startswith('from '):
442 return line
[len('from '):]
444 self
.abort(errors
.MissingSection
, required_for
, 'from')
449 def _get_merge(self
):
450 """Parse a merge section."""
451 line
= self
.next_line()
454 elif line
.startswith('merge '):
455 return line
[len('merge '):]
460 def _get_property(self
):
461 """Parse a property section."""
462 line
= self
.next_line()
465 elif line
.startswith('property '):
466 return self
._name
_value
(line
[len('property '):])
471 def _get_user_info(self
, cmd
, section
, required
=True,
472 accept_just_who
=False):
473 """Parse a user section."""
474 line
= self
.next_line()
475 if line
.startswith(section
+ ' '):
476 return self
._who
_when
(line
[len(section
+ ' '):], cmd
, section
,
477 accept_just_who
=accept_just_who
)
479 self
.abort(errors
.MissingSection
, cmd
, section
)
484 def _get_data(self
, required_for
, section
='data'):
485 """Parse a data section."""
486 line
= self
.next_line()
487 if line
.startswith('data '):
488 rest
= line
[len('data '):]
489 if rest
.startswith('<<'):
490 return self
.read_until(rest
[2:])
493 read_bytes
= self
.read_bytes(size
)
494 # optional LF after data.
495 next
= self
.input.readline()
497 if len(next
) > 1 or next
!= "\n":
498 self
.push_line(next
[:-1])
501 self
.abort(errors
.MissingSection
, required_for
, section
)
503 def _who_when(self
, s
, cmd
, section
, accept_just_who
=False):
504 """Parse who and when information from a string.
506 :return: a tuple of (name,email,timestamp,timezone). name may be
507 the empty string if only an email address was given.
509 match
= _WHO_AND_WHEN_RE
.search(s
)
511 datestr
= match
.group(3).lstrip()
512 if self
.date_parser
is None:
513 # auto-detect the date format
514 if len(datestr
.split(' ')) == 2:
516 elif datestr
== 'now':
520 self
.date_parser
= dates
.DATE_PARSERS_BY_NAME
[format
]
522 when
= self
.date_parser(datestr
, self
.lineno
)
524 print "failed to parse datestr '%s'" % (datestr
,)
527 match
= _WHO_RE
.search(s
)
528 if accept_just_who
and match
:
529 # HACK around missing time
530 # TODO: output a warning here
531 when
= dates
.DATE_PARSERS_BY_NAME
['now']('now')
533 self
.abort(errors
.BadFormat
, cmd
, section
, s
)
534 name
= match
.group(1)
538 name
= name
[:-1].decode('utf_8')
539 except UnicodeDecodeError:
540 # The spec says names are *typically* utf8 encoded
541 # but that isn't enforced by git-fast-export (at least)
542 self
.warning("%s name not in utf8 - replacing unknown "
543 "characters" % (section
,))
544 name
= name
[:-1].decode('utf_8', 'replace')
545 email
= match
.group(2)
546 # While it shouldn't happen, some datasets have email addresses
547 # which contain unicode characters. See bug 338186. We sanitize
548 # the data at this level just in case.
550 email
= email
.decode('utf_8')
551 except UnicodeDecodeError:
552 self
.warning("%s email not in utf8 - replacing unknown characters"
554 email
= email
.decode('utf_8', 'replace')
556 name
, email
= self
.user_mapper
.map_name_and_email(name
, email
)
557 return (name
, email
, when
[0], when
[1])
559 def _name_value(self
, s
):
560 """Parse a (name,value) tuple from 'name value-length value'."""
561 parts
= s
.split(' ', 2)
568 still_to_read
= size
- len(value
)
569 if still_to_read
> 0:
570 read_bytes
= self
.read_bytes(still_to_read
)
571 value
+= "\n" + read_bytes
[:still_to_read
- 1]
572 value
= value
.decode('utf8')
577 if s
.startswith('"'):
579 self
.abort(errors
.BadFormat
, '?', '?', s
)
581 return _unquote_c_string(s
[1:-1])
583 return s
.decode('utf_8')
584 except UnicodeDecodeError:
585 # The spec recommends utf8 encoding but that isn't enforced
588 def _path_pair(self
, s
):
589 """Parse two paths separated by a space."""
590 # TODO: handle a space in the first path
591 if s
.startswith('"'):
592 parts
= s
[1:].split('" ', 1)
594 parts
= s
.split(' ', 1)
596 self
.abort(errors
.BadFormat
, '?', '?', s
)
597 elif parts
[1].startswith('"') and parts
[1].endswith('"'):
598 parts
[1] = parts
[1][1:-1]
599 elif parts
[1].startswith('"') or parts
[1].endswith('"'):
600 self
.abort(errors
.BadFormat
, '?', '?', s
)
601 return map(_unquote_c_string
, parts
)
604 """Parse a file mode into executable and kind.
606 :return (is_executable, kind)
608 # Note: Output from git-fast-export slightly different to spec
609 if s
in ['644', '100644', '0100644']:
610 return False, commands
.FILE_KIND
611 elif s
in ['755', '100755', '0100755']:
612 return True, commands
.FILE_KIND
613 elif s
in ['040000', '0040000']:
614 return False, commands
.DIRECTORY_KIND
615 elif s
in ['120000', '0120000']:
616 return False, commands
.SYMLINK_KIND
617 elif s
in ['160000', '0160000']:
618 return False, commands
.TREE_REFERENCE_KIND
620 self
.abort(errors
.BadFormat
, 'filemodify', 'mode', s
)
623 def _unquote_c_string(s
):
624 """replace C-style escape sequences (\n, \", etc.) with real chars."""
625 # HACK: Python strings are close enough
626 return s
.decode('string_escape', 'replace')