contrib/server-side/fsfsverify.py

   1 #!/usr/bin/env python
   2 # Copyright (c) 2006, 2007 by John Szakmeister <john at szakmeister dot net>
   3 #
   4 # This program is free software; you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation; either version 2 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program; if not, write to the Free Software
  16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 import os
  19 import optparse
  20 import sys
  21 import re
  22
  23
  24 # A handy constant for refering to the NULL digest (one that
  25 # matches every digest).
  26 NULL_DIGEST = '00000000000000000000000000000000'
  27
  28
  29 class FsfsVerifyException(Exception):
  30   pass
  31
  32
  33 class PotentiallyFixableException(FsfsVerifyException):
  34   '''Represents a class of problems that we may be able to fix.'''
  35
  36   def __init__(self, message, offset):
  37     FsfsVerifyException.__init__(self, message)
  38     self.offset = offset
  39
  40
  41 class InvalidInstruction(PotentiallyFixableException):
  42   pass
  43
  44
  45 class InvalidCompressedStream(PotentiallyFixableException):
  46   pass
  47
  48
  49 class InvalidRepHeader(PotentiallyFixableException):
  50   pass
  51
  52
  53 class InvalidWindow(PotentiallyFixableException):
  54   pass
  55
  56
  57 class InvalidSvndiffVersion(FsfsVerifyException):
  58   pass
  59
  60
  61 class InvalidSvndiffHeader(FsfsVerifyException):
  62   pass
  63
  64
  65 class DataCorrupt(FsfsVerifyException):
  66   pass
  67
  68
  69 class NoMoreData(FsfsVerifyException):
  70   pass
  71
  72
  73 LOG_INSTRUCTIONS = 1
  74 LOG_WINDOWS = 2
  75 LOG_SVNDIFF = 4
  76
  77 LOG_MASK = LOG_SVNDIFF
  78
  79
  80 def log(type, indent, format, *args):
  81   if type & LOG_MASK:
  82     indentStr = ' ' * indent
  83     str = format % args
  84     str = '\n'.join([indentStr + x for x in str.split('\n')])
  85     print str
  86
  87
  88 class ByteStream(object):
  89   def __init__(self, fileobj):
  90     self._f = fileobj
  91
  92   def readByte(self):
  93     return ord(self._f.read(1))
  94
  95   def tell(self):
  96     return self._f.tell()
  97
  98   def advance(self, numBytes):
  99     self._f.seek(numBytes, 1)
 100
 101   def clone(self):
 102     if hasattr(self._f, 'clone'):
 103       newFileObj = self._f.clone()
 104     else:
 105       # We expect the file object to map to a real file
 106       #
 107       # Tried using dup(), but (at least on the mac), that ends up
 108       # creating 2 handles to the same underlying os file object,
 109       # instead of two independent file objects.  So, we resort to
 110       # an open call to create a new file object
 111       newFileObj = open(self._f.name, 'rb')
 112       newFileObj.seek(self._f.tell())
 113     return ByteStream(newFileObj)
 114
 115   # The following let ByteStream behave as a file within the
 116   # context of this script.
 117
 118   def read(self, *args, **kwargs):
 119     return self._f.read(*args, **kwargs)
 120
 121   def seek(self, *args, **kwargs):
 122     return self._f.seek(*args, **kwargs)
 123
 124
 125 class ZlibByteStream(ByteStream):
 126   def __init__(self, fileobj, length):
 127     self._f = fileobj
 128
 129     # Store the number of bytes consumed thus far so we can compute an offset
 130     self._numBytesConsumed = 0
 131
 132     self._startingOffset = self._f.tell()
 133
 134     import zlib, binascii
 135     self._z = zlib.decompressobj(15)
 136
 137     self._buffer = self._z.decompress(self._f.read(length))
 138     self._origBufferLength = len(self._buffer)
 139
 140   def readByte(self):
 141     if not self._buffer:
 142       raise NoMoreData, "Unexpected end of data stream!"
 143
 144     byte = self._buffer[0]
 145     self._buffer = self._buffer[1:]
 146
 147     return ord(byte)
 148
 149   def tell(self):
 150     return self._origBufferLength - len(self._buffer)
 151
 152   def advance(self, numBytes):
 153     while numBytes:
 154       self.readByte()
 155
 156   def clone(self):
 157     if hasattr(self._f, 'clone'):
 158       newFileObj = self._f.clone()
 159     else:
 160       newFileObj = open(self._f.name, 'rb')
 161       newFileObj.seek(self._f.tell())
 162     return ByteStream(newFileObj)
 163
 164   # The following let ByteStream behave as a file within the
 165   # context of this script.
 166
 167   def read(self, *args, **kwargs):
 168     raise
 169
 170   def seek(self, *args, **kwargs):
 171     raise
 172
 173
 174 def getVarint(byteStream):
 175   '''Grabs a variable sized int from a bitstream (meaning this function
 176   doesn't seek).'''
 177
 178   i = long(0)
 179   while True:
 180     byte = byteStream.readByte()
 181     i = (i << 7) + (byte & 0x7F)
 182     if byte & 0x80 == 0:
 183       break
 184   return i
 185
 186
 187 INSTR_COPY_SOURCE = 'copy-source'
 188 INSTR_COPY_TARGET = 'copy-target'
 189 INSTR_COPY_DATA = 'copy-data'
 190
 191
 192 class SvndiffInstruction(object):
 193   def __init__(self, byteStream):
 194     self.instrOffset = byteStream.tell()
 195
 196     byte = byteStream.readByte()
 197
 198     instruction = (byte >> 6) & 3
 199     length = byte & 0x3F
 200
 201     if instruction == 3:
 202       raise InvalidInstruction(
 203         "Invalid instruction found at offset %d (%02X)" % (self.instrOffset,
 204                                                            byte),
 205         self.instrOffset)
 206
 207     if instruction == 0:
 208       self.type = INSTR_COPY_SOURCE
 209     elif instruction == 1:
 210       self.type = INSTR_COPY_TARGET
 211     else:
 212       self.type = INSTR_COPY_DATA
 213
 214     if length == 0:
 215       # Length is coded as a varint following the current byte
 216       length = getVarint(byteStream)
 217
 218
 219     self.length = length
 220
 221     if (self.type == INSTR_COPY_SOURCE) or (self.type == INSTR_COPY_TARGET):
 222       self.offset = getVarint(byteStream)
 223
 224     if self.type == INSTR_COPY_SOURCE:
 225       self.sourceOffset = self.offset
 226     else:
 227       self.sourceOffset = 0
 228
 229     if self.type == INSTR_COPY_TARGET:
 230       self.targetOffset = self.offset
 231     else:
 232       self.targetOffset = 0
 233
 234     # Determine the number of bytes consumed in the source stream, target
 235     # stream, and the data stream
 236
 237     if self.type == INSTR_COPY_SOURCE:
 238       self.sourceLength = self.length
 239     else:
 240       self.sourceLength = 0
 241
 242     if self.type == INSTR_COPY_TARGET:
 243       self.targetLength = self.length
 244     else:
 245       self.targetLength = 0
 246
 247     if self.type == INSTR_COPY_DATA:
 248       self.dataLength = self.length
 249     else:
 250       self.dataLength = 0
 251
 252     self.instrLength = byteStream.tell() - self.instrOffset
 253
 254   def __repr__(self):
 255     return '<SvndiffInstruction %s so:%d sl:%d to: %d tl:%d dl:%d (%d, %d)>' % (
 256       self.type, self.sourceOffset, self.sourceLength, self.targetOffset,
 257       self.targetLength, self.dataLength, self.instrOffset, self.instrLength)
 258
 259
 260 class Window(object):
 261   def __init__(self, byteStream, svndiffVersion):
 262     if svndiffVersion not in [0, 1]:
 263       raise InvalidSvndiffVersion, \
 264         "Invalid svndiff version %d" % svndiffVersion
 265
 266     # Record the initial offset of the window
 267     self.windowOffset = byteStream.tell()
 268
 269     try:
 270       self.sourceOffset = getVarint(byteStream)
 271       self.sourceLength = getVarint(byteStream)
 272       self.targetLength = getVarint(byteStream)
 273       self.instrLength = getVarint(byteStream)
 274       self.dataLength = getVarint(byteStream)
 275       self.windowHeaderLength = byteStream.tell() - self.windowOffset
 276       self.windowLength = \
 277         self.windowHeaderLength + self.instrLength + self.dataLength
 278
 279       # Store the byte stream, and clone it for use as a data stream.
 280       self.instrByteStream = byteStream
 281       self.dataByteStream = byteStream.clone()
 282
 283       # Advance the data stream past the instructions to the start of the data.
 284       self.dataByteStream.advance(self.instrLength)
 285     except:
 286       e = InvalidWindow(
 287         "The window header at offset %d appears to be corrupted" % \
 288           (self.windowOffset),
 289         self.windowOffset)
 290       e.windowOffset = self.windowOffset
 291       raise e
 292
 293
 294     # In svndiff1, the instruction area starts with a varint-encoded length.
 295     # If this length matches the one encoded in the header, then there is no
 296     # compression.  If it differs, then the stream is compressed with zlib.
 297
 298     self.origInstrStream = self.instrByteStream
 299     self.origDataStream = self.dataByteStream
 300     self.isInstrCompressed = False
 301     self.isDataCompressed = False
 302     self.compressedInstrLength = self.instrLength
 303     self.compressedDataLength = self.dataLength
 304
 305     if svndiffVersion == 1:
 306       try:
 307         offset = self.instrByteStream.tell()
 308         encodedInstrLength = getVarint(self.instrByteStream)
 309         instrIntSize = self.instrByteStream.tell() - offset
 310
 311         offset = self.dataByteStream.tell()
 312         encodedDataLength = getVarint(self.dataByteStream)
 313         dataIntSize = self.dataByteStream.tell() - offset
 314
 315         self.instrLength = encodedInstrLength
 316         self.dataLength = encodedDataLength
 317       except:
 318         e = InvalidWindow(
 319           "The window header at offset %d appears to be corrupted" % \
 320             (self.windowOffset),
 321           self.windowOffset)
 322         e.windowOffset = self.windowOffset
 323         raise e
 324
 325       # Now, we need to make a determination about whether the data and
 326       # instructions are compressed.  If they are, we need to zlib decompress
 327       # them.  We do that by creating another stream and that will decompress
 328       # the data on the fly.
 329       try:
 330         offset = self.instrByteStream.tell()
 331         if self.compressedInstrLength - instrIntSize != self.instrLength:
 332           self.origInstrStream = self.instrByteStream
 333           self.instrByteStream = ZlibByteStream(self.origInstrStream,
 334                                                 self.compressedInstrLength)
 335           self.isInstrCompressed = True
 336       except Exception, e:
 337         new_e = InvalidCompressedStream(
 338           "Invalid compressed instr stream at offset %d (%s)" % (offset,
 339                                                                  str(e)),
 340           offset)
 341         new_e.windowOffset = self.windowOffset
 342         raise new_e
 343
 344       try:
 345         offset = self.dataByteStream.tell()
 346         if self.compressedDataLength - dataIntSize != self.dataLength:
 347           self.origDataStream = self.dataByteStream
 348           self.dataByteStream = ZlibByteStream(self.origDataStream,
 349                                                self.compressedDataLength)
 350           self.isDataCompressed = True
 351       except Exception, e:
 352         new_e = InvalidCompressedStream(
 353           "Invalid compressed data stream at offset %d (%s)" % (offset,
 354                                                                 str(e)),
 355           offset)
 356         new_e.windowOffset = self.windowOffset
 357         raise new_e
 358
 359   def verify(self):
 360     expectedInstrLength = self.instrLength
 361     expectedDataLength = self.dataLength
 362     expectedTargetLength = self.targetLength
 363     expectedSourceLength = self.sourceLength
 364
 365     computedInstrLength = 0
 366     computedDataLength = 0
 367     computedTargetLength = 0
 368     computedSourceLength = 0
 369
 370     if expectedInstrLength == 0:
 371       e = InvalidWindow(
 372         "Corrupt window (at offset %d) has 0 instructions?!" % self.windowOffset,
 373         self.windowOffset)
 374       e.windowOffset = self.windowOffset
 375       raise e
 376
 377     while computedInstrLength < expectedInstrLength:
 378       try:
 379         instr = SvndiffInstruction(self.instrByteStream)
 380       except PotentiallyFixableException, e:
 381         e.window = self
 382         e.windowOffset = self.windowOffset
 383         raise
 384
 385       log(LOG_INSTRUCTIONS, 4, repr(instr))
 386
 387       computedInstrLength += instr.instrLength
 388       computedDataLength += instr.dataLength
 389       computedSourceLength += instr.sourceLength
 390       computedTargetLength += \
 391         instr.targetLength + instr.sourceLength + instr.dataLength
 392
 393     if computedInstrLength != expectedInstrLength:
 394       e = InvalidWindow(
 395         "The number of instruction bytes consumed (%d) doesn't match the expected number (%d)" % \
 396           (computedInstrLength, expectedInstrLength),
 397         self.windowOffset)
 398       e.windowOffset = self.windowOffset
 399       raise e
 400
 401     if computedDataLength != expectedDataLength:
 402       e = InvalidWindow(
 403         "The number of data bytes consumed (%d) doesn't match the expected number (%d)" % \
 404           (computedDataLength, expectedDataLength),
 405         self.windowOffset)
 406       e.windowOffset = self.windowOffset
 407       raise e
 408
 409     if computedTargetLength != expectedTargetLength:
 410       e = InvalidWindow(
 411         "The number of target bytes consumed (%d) doesn't match the expected number (%d)" % \
 412           (computedTargetLength, expectedTargetLength),
 413         self.windowOffset)
 414       e.windowOffset = self.windowOffset
 415       raise e
 416
 417     # It appears that the source length specified in the window, isn't exactly
 418     # equal to what gets consumed.  I suspect that's because the algorithm is using different
 419     # offsets within the window, and one offset/length pair will reach the end of the window.
 420     # However, this hasn't shown to be a clear indicator of corruption.  So for now, I'm
 421     # commenting it out.
 422     #
 423     #if computedSourceLength != expectedSourceLength:
 424     #  e = InvalidWindow(
 425     #    "The number of source bytes consumed (%d) doesn't match the expected number (%d)" % \
 426     #      (computedSourceLength, expectedSourceLength),
 427     #    self.windowOffset)
 428     #  e.windowOffset = self.windowOffset
 429     #  raise e
 430
 431     # Advance past the data.  We do this using seek because we might have
 432     # read a few bytes from the stream if it potentially had compressed data
 433     self.origInstrStream.seek(self.windowOffset + self.windowLength)
 434
 435   def __repr__(self):
 436     if hasattr(self, 'compressedInstrLength'):
 437       str = 'cil: %d cdl: %d ' % (self.compressedInstrLength,
 438                                   self.compressedDataLength)
 439     else:
 440       str = ''
 441
 442     return "<Window wo:%d so:%d sl:%d tl:%d %sil:%d dl:%d whl:%d wl:%d>" % (
 443       self.windowOffset, self.sourceOffset, self.sourceLength,
 444       self.targetLength, str, self.instrLength, self.dataLength,
 445       self.windowHeaderLength, self.windowLength)
 446
 447
 448 class Svndiff(object):
 449   def __init__(self, fileobj, length):
 450     self._f = fileobj
 451     self.startingOffset = self._f.tell()
 452
 453     header = self._f.read(4)
 454     if len(header) != 4:
 455       raise EOFError, \
 456         "Unexpected end of file while svndiff header at offset %d)" % \
 457         (self._f.tell())
 458
 459     if header[0:3] != 'SVN':
 460       raise InvalidSvndiffHeader, "Invalid svndiff header at offset %d" % \
 461       (self.startingOffset)
 462
 463     self.version = ord(header[3])
 464     if self.version not in [0, 1]:
 465       raise InvalidSvndiffVersion, "Invalid svndiff version %d" % self.version
 466
 467     self._length = length - 4
 468
 469   def verify(self):
 470     self._f.seek(self.startingOffset+4)
 471
 472     bs = ByteStream(self._f)
 473
 474     log(LOG_SVNDIFF, 2, "<Svndiff so: %d ver: %d>", self.startingOffset,
 475         self.version)
 476
 477     try:
 478       remaining = self._length
 479       while remaining > 0:
 480         w = Window(bs, self.version)
 481         log(LOG_WINDOWS, 3, repr(w))
 482         w.verify()
 483         remaining -= w.windowLength
 484     except PotentiallyFixableException, e:
 485       e.svndiffStart = self.startingOffset
 486       raise
 487
 488
 489 def getDirHash(f):
 490   l = f.readline()
 491   if l != 'PLAIN\n':
 492     raise ValueError, "Expected a PLAIN representation (%d)" % f.tell()
 493
 494   hash = {}
 495
 496   while True:
 497     field = f.readline()[:-1]
 498     if field == 'END':
 499       break
 500     assert(field[0] == 'K')
 501     length = int(field.split(' ')[1])
 502     field = f.readline()[:length]
 503
 504     value = f.readline()[:-1]
 505     assert(value[0] == 'V')
 506     length = int(value.split(' ')[1])
 507     value = f.readline()[:length]
 508
 509     (type, txn) = value.split(' ')
 510     hash[field] = [NodeType(type), NodeId(txn)]
 511
 512   return hash
 513
 514
 515
 516 class Rep(object):
 517   def __init__(self, type, rev, offset, length, size, digest,
 518                contentType, currentRev, noderev):
 519     self.type = type
 520     self.rev = int(rev)
 521     self.offset = int(offset)
 522     self.length = int(length)
 523     self.size = int(size)
 524
 525     self.digest = digest
 526     self.currentRev = currentRev
 527
 528     self.contentType = contentType
 529     self.noderev = noderev
 530
 531   def __repr__(self):
 532     if not self.contentType:
 533       contentType = 'UNKNOWN'
 534     else:
 535       if self.contentType not in ['PLAIN', 'DELTA', None]:
 536         contentType = 'INVALID'
 537       else:
 538         contentType = self.contentType
 539     return '%s: %s %d %d %d %d %s' % (self.type, contentType, self.rev,
 540                                       self.offset, self.length, self.size,
 541                                       self.digest)
 542
 543   def verify(self, f, dumpInstructions, dumpWindows):
 544     if self.contentType not in ['PLAIN', 'DELTA', None]:
 545       e = InvalidRepHeader("Invalid rep header found at %d (%s)!" % \
 546                                      (self.offset, self.contentType),
 547                            self.offset)
 548       e.rep = self
 549       e.noderev = self.noderev
 550       raise e
 551
 552     if self.rev != currentRev:
 553       print >>sys.stderr, "Skipping text rep since it isn't present in the current rev"
 554       return
 555
 556     f.seek(self.offset)
 557     header = f.read(5)
 558     if header != self.contentType:
 559       raise FsfsVerifyException, \
 560         "Invalid rep header found at %d (%s, %s)!" % (self.offset, header,
 561                                                       self.contentType)
 562
 563     if header == 'DELTA':
 564       # Consume the rest of the DELTA header
 565       while f.read(1) != '\n':
 566         pass
 567
 568       # This should be the start of the svndiff stream
 569       actual_start = f.tell()
 570       try:
 571         svndiff = Svndiff(f, self.length)
 572         svndiff.verify()
 573         digest = None
 574       except Exception, e:
 575         e.rep = self
 576         e.noderev = self.noderev
 577         raise
 578
 579       if digest and (self.digest != NULL_DIGEST):
 580         assert(digest == self.digest)
 581     else:
 582       if f.read(1) != '\n':
 583         raise DataCorrupt, "Expected a '\\n' after PLAIN"
 584
 585       import md5
 586       m = md5.new()
 587       m.update(f.read(self.length))
 588
 589       if self.digest and self.digest != NULL_DIGEST \
 590           and self.digest != m.hexdigest():
 591         raise DataCorrupt, \
 592           "PLAIN data is corrupted.  Expected digest '%s', computed '%s'." % (
 593             self.digest, m.hexdigest())
 594
 595       if f.read(7) != 'ENDREP\n':
 596         raise DataCorrupt, "Terminating ENDREP missing!"
 597
 598
 599 class TextRep(Rep):
 600   def __init__(self, rev, offset, length, size, digest,
 601                contentType, currentRev, noderev):
 602     super(TextRep,self).__init__('text', rev, offset, length, size,
 603                                  digest, contentType, currentRev, noderev)
 604
 605
 606 class PropRep(Rep):
 607   def __init__(self, rev, offset, length, size, digest,
 608                contentType, currentRev, noderev):
 609     super(PropRep,self).__init__('prop', rev, offset, length, size,
 610                                  digest, contentType, currentRev, noderev)
 611
 612
 613 class NodeId(object):
 614   def __init__(self, nodeid):
 615     (self.txn_name, offset) = nodeid.split('/')
 616     self.offset = int(offset)
 617     self.rev = int(self.txn_name.split('.')[2][1:])
 618
 619   def __repr__(self):
 620     return self.txn_name + '/%d' % self.offset
 621
 622   def __eq__ (self, other):
 623     s = self.txn_name + '/%d' % self.offset
 624     if s == other:
 625       return True
 626
 627     return False
 628
 629
 630 class NodeType(object):
 631   def __init__(self, t):
 632     if (t != 'file') and (t != 'dir'):
 633       raise ValueError, 'Invalid Node type received: "%s"' % t
 634     self.type = t
 635
 636   def __repr__(self):
 637     return self.type[:]
 638
 639
 640 class NodeRev(object):
 641   def __init__(self, f, currentRev):
 642     self.pred = None
 643     self.text = None
 644     self.props = None
 645     self.cpath = None
 646     self.copyroot = None
 647     self.copyfrom = None
 648     self.dir = []
 649
 650     self.nodeOffset = f.tell()
 651
 652     while True:
 653       line = f.readline()
 654       if line == '':
 655         raise IOError, "Unexpected end of file"
 656       if line == '\n':
 657         break
 658
 659       # break apart the line
 660       try:
 661         (field, value) = line.split(':', 1)
 662       except:
 663         print repr(line)
 664         print self.nodeOffset
 665         print f.tell()
 666         raise
 667
 668       # pull of the leading space and trailing new line
 669       value = value[1:-1]
 670
 671       if field == 'id':
 672         self.id = NodeId(value)
 673       elif field == 'type':
 674         self.type = NodeType(value)
 675       elif field == 'pred':
 676         self.pred = NodeId(value)
 677       elif field == 'text':
 678         (rev, offset, length, size, digest) = value.split(' ')
 679         rev = int(rev)
 680         offset = int(offset)
 681         length = int(length)
 682         size = int(size)
 683
 684         if rev != currentRev:
 685           contentType = None
 686         else:
 687           savedOffset = f.tell()
 688           f.seek(offset)
 689           contentType = f.read(5)
 690           f.seek(savedOffset)
 691
 692         self.text = TextRep(rev, offset, length, size, digest,
 693                             contentType, currentRev, self)
 694       elif field == 'props':
 695         (rev, offset, length, size, digest) = value.split(' ')
 696         rev = int(rev)
 697         offset = int(offset)
 698         length = int(length)
 699         size = int(size)
 700
 701         if rev != currentRev:
 702           contentType = None
 703         else:
 704           savedOffset = f.tell()
 705           f.seek(offset)
 706           contentType = f.read(5)
 707           f.seek(savedOffset)
 708
 709         self.props = PropRep(rev, offset, length, size, digest,
 710                              contentType, currentRev, self)
 711       elif field == 'cpath':
 712         self.cpath = value
 713       elif field == 'copyroot':
 714         self.copyroot = value
 715       elif field == 'copyfrom':
 716         self.copyfrom = value
 717
 718     if self.type.type == 'dir':
 719       if self.text:
 720         if self.id.rev == self.text.rev:
 721           offset = f.tell()
 722           f.seek(self.text.offset)
 723           self.dir = getDirHash(f)
 724           f.seek(offset)
 725         else:
 726           # The directory entries are stored in another file.
 727           print "Warning: dir entries are stored in rev %d for noderev %s" % (
 728             self.text.rev, repr(self.id))
 729
 730   def __repr__(self):
 731     str = 'NodeRev Id: %s\n type: %s\n' % (repr(self.id), repr(self.type))
 732     if self.pred:
 733       str = str + ' pred: %s\n' % repr(self.pred)
 734     if self.text:
 735       str = str + ' %s\n' % repr(self.text)
 736     if self.props:
 737       str = str + ' %s\n' % repr(self.props)
 738     if self.cpath:
 739       str = str + ' cpath: %s\n' % self.cpath
 740     if self.copyroot:
 741       str = str + ' copyroot: %s\n' % self.copyroot
 742     if self.copyfrom:
 743       str = str + ' copyfrom: %s\n' % self.copyfrom
 744     if self.dir:
 745       str = str + ' dir contents:\n'
 746       for k in self.dir:
 747         str = str + '  %s: %s\n' % (k, self.dir[k])
 748     return str[:-1]
 749
 750
 751 class ChangedPaths(object):
 752   def __init__(self, f):
 753     self.changedPaths = {}
 754
 755     while True:
 756       currentOffset = revFile.tell()
 757       action = revFile.readline()
 758       if action == '\n' or action == '':
 759         break
 760
 761       path = action[:-1]
 762       try:
 763         (id, action, textMod, propMod) = action[:-1].split(' ')[:4]
 764       except:
 765         raise DataCorrupt, \
 766           "Data appears to be corrupt at offset %d" % currentOffset
 767       path = path[len(' '.join([id, action, textMod, propMod]))+1:]
 768
 769       line = revFile.readline()
 770       if line != '\n':
 771         (copyfromRev, copyfromPath) = line[:-1].split(' ', 1)
 772       else:
 773         copyfromRev = -1
 774         copyfromPath = ''
 775
 776       self.changedPaths[path] = (id, action, textMod, propMod,
 777                                  copyfromRev, copyfromPath)
 778
 779
 780   def __iter__(self):
 781     return self.changedPaths.iteritems()
 782
 783
 784 def getRootAndChangedPaths(revFile):
 785   offset = -2
 786   while True:
 787     revFile.seek(offset, 2)
 788     c = revFile.read(1)
 789     if c == '\n':
 790       offset = revFile.tell()
 791       break
 792     offset = offset - 1
 793
 794   (rootNode, changedPaths) = map(int, revFile.readline().split(' '))
 795
 796   return (rootNode, changedPaths)
 797
 798
 799 def dumpChangedPaths(changedPaths):
 800   print "Changed Path Information:"
 801   for (path,
 802        (id, action, textMod, propMod,
 803         copyfromRev, copyfromPath)) in changedPaths:
 804     print " %s:" % path
 805     print "  id: %s" % id
 806     print "  action: %s" % action
 807     print "  text mod: %s" % textMod
 808     print "  prop mod: %s" % propMod
 809     if copyfromRev != -1:
 810       print "  copyfrom path: %s" % copyfromPath
 811       print "  copyfrom rev: %s" % copyfromRev
 812     print
 813
 814
 815 class WalkStrategy(object):
 816   def __init__(self, filename, rootOffset, currentRev):
 817     self.f = open(filename, 'rb')
 818     self.rootOffset = rootOffset
 819     self.f.seek(rootOffset)
 820     self.currentRev = currentRev
 821
 822   def _nodeWalker(self):
 823     raise NotImplementedError, "_nodeWalker is not implemented"
 824
 825   def __iter__(self):
 826     self.f.seek(self.rootOffset)
 827     return self._nodeWalker()
 828
 829
 830 class ClassicStrategy(WalkStrategy):
 831   def _nodeWalker (self):
 832     noderev = NodeRev(self.f, self.currentRev)
 833     yield noderev
 834
 835     if noderev.type.type == 'dir':
 836       for e in noderev.dir:
 837         if noderev.dir[e][1].rev == noderev.id.rev:
 838           self.f.seek(noderev.dir[e][1].offset)
 839           for x in self._nodeWalker():
 840             yield x
 841
 842
 843 class RegexpStrategy(WalkStrategy):
 844   def __init__(self, filename, rootOffset, currentRev):
 845     WalkStrategy.__init__(self, filename, rootOffset, currentRev)
 846
 847     # File object passed to the NodeRev() constructor so that it
 848     # doesn't interfere with our regex search.
 849     self.nodeFile = open(filename, 'rb')
 850
 851   def _nodeWalker(self):
 852     nodeId_re = re.compile(r'^id: [a-z0-9\./]+$')
 853
 854     self.f.seek(0)
 855     offset = 0
 856
 857     for line in self.f:
 858       match = nodeId_re.search(line)
 859       if match:
 860         self.nodeFile.seek(offset)
 861         noderev = NodeRev(self.nodeFile, self.currentRev)
 862         yield noderev
 863
 864       offset = offset + len(line)
 865
 866
 867 def verify(noderev, revFile, dumpInstructions, dumpWindows):
 868   print noderev
 869
 870   if noderev.text:
 871     noderev.text.verify(revFile,
 872                         dumpInstructions,
 873                         dumpWindows)
 874
 875   if noderev.props and noderev.props.rev == noderev.props.currentRev:
 876     noderev.props.verify(revFile,
 877                          dumpInstructions,
 878                          dumpWindows)
 879
 880   print
 881
 882
 883 def truncate(noderev, revFile):
 884   txnId = noderev.id
 885
 886   print "Truncating node %s (%s)" % (txnId, noderev.cpath)
 887
 888   # Grab the text rep
 889   textRep = noderev.text
 890
 891   # Fix the text rep contents
 892   offset = textRep.offset
 893   revFile.seek(offset, 0)
 894   revFile.write("PLAIN\x0aENDREP\x0a")
 895
 896   # Fix the node rev
 897   offset = noderev.nodeOffset
 898   revFile.seek(offset, 0)
 899   while True:
 900     savedOffset = revFile.tell()
 901     s = revFile.readline()
 902     if s[:4] == 'text':
 903       revFile.seek(savedOffset, 0)
 904       break
 905
 906   line = revFile.readline()
 907   revFile.seek(savedOffset, 0)
 908   fields = line.split(' ')
 909   overallLength = len(line)
 910
 911   fields[3] = '0' * len(fields[3])
 912   fields[4] = '0' * len(fields[4])
 913   fields[5] = 'd41d8cd98f00b204e9800998ecf8427e'
 914   newTextRep = ' '.join(fields) + '\x0a'
 915   assert(len(newTextRep) == overallLength)
 916   revFile.write(newTextRep)
 917   print "Done."
 918   sys.exit(0)
 919
 920
 921 def fixHeader(e, revFile):
 922   '''Attempt to fix the rep header.  e is expected to be of type
 923   InvalidRepHeader, since the exception stores the necessary information
 924   to help repair the file.'''
 925
 926   # First, we need to locate the real start of the text rep
 927   textrep_re = re.compile(r'^(DELTA( \d+ \d+ \d+)?|PLAIN)$')
 928
 929   revFile.seek(0)
 930   offset = 0
 931   originalOffset = 0
 932   for line in revFile:
 933     m = textrep_re.match(line)
 934     if m:
 935       if offset >= originalOffset and offset < e.offset:
 936         originalOffset = offset
 937         headerLen = len(line)
 938     offset = offset + len(line)
 939
 940   print "Original text rep located at", originalOffset
 941
 942   # Okay, now we have the original offset of the text rep that was
 943   # in the process of being written out.  The header portion of the
 944   # text rep has a fsync() done after it, so the 4K blocks actually
 945   # start after the header.  We need to make sure to copy the header
 946   # and the next 4K, to be on the safe side.
 947   copyLen = 4096 + headerLen
 948
 949   revFile.seek(originalOffset)
 950   block = revFile.read(copyLen)
 951   print "Copy %d bytes from offset %d" % (copyLen, originalOffset)
 952
 953   print "Write %d bytes at offset %d" % (copyLen, e.offset)
 954   revFile.seek(e.offset)
 955   revFile.write(block)
 956   revFile.flush()
 957
 958   print "Fixed? :-)  Re-run fsfsverify without the -f option"
 959
 960
 961 def fixStream(e, revFile):
 962   startOffset = e.svndiffStart
 963   errorOffset = e.windowOffset
 964
 965   repeatedBlockOffset = errorOffset - ((errorOffset - startOffset) % 4096)
 966
 967   # Now we need to move up the rest of the rep
 968
 969   # Determine the final offset by finding the end of the rep.
 970   revFile.seek(errorOffset)
 971
 972   endrep_re = re.compile(".*ENDREP$")
 973   srcLength = 0
 974   for l in revFile:
 975     srcLength += len(l)
 976     m = endrep_re.match(l)
 977     if m:
 978       break
 979
 980   if not m:
 981     raise "Couldn't find end of rep!"
 982
 983   finalOffset = errorOffset + srcLength
 984   srcOffset = errorOffset
 985   destOffset = repeatedBlockOffset
 986
 987   print "Copy %d bytes from offset %d" % (srcLength, srcOffset)
 988   print "Write %d bytes at offset %d" % (srcLength, destOffset)
 989
 990   while srcOffset < finalOffset:
 991     blen = 64*1024
 992     if (finalOffset - srcOffset) < blen:
 993       blen = finalOffset - srcOffset
 994     revFile.seek(srcOffset)
 995     block = revFile.read(blen)
 996     revFile.seek(destOffset)
 997     revFile.write(block)
 998
 999     srcOffset += blen
1000     destOffset += blen
1001
1002   revFile.flush()
1003   revFile.close()
1004
1005   print "Fixed? :-)  Re-run fsfsverify without the -f option"
1006
1007
def checkOptions(options):
  '''Validate the combination of parsed command-line options.

  options is the values object produced by the option parser.

  At most one of dumpChanged (-c), fixRlle (-f) and truncate (-t) may be
  set; otherwise a message is written to stderr and the process exits
  with status 1.

  If dumpWindows (-w) or dumpInstructions (-i) is combined with
  dumpChanged (-c) or noVerify (--no-verify), a warning is written to
  stderr and both dump options are cleared on options, as the warning
  announces.'''
  exclusive = ('dumpChanged', 'truncate', 'fixRlle')
  count = len([name for name in exclusive if getattr(options, name, None)])

  if count > 1:
    sys.stderr.write("Please use only one of -c, -f, and -t.\n")
    sys.exit(1)

  if not (options.dumpWindows or options.dumpInstructions):
    return

  dropDumps = False

  if options.dumpChanged:
    sys.stderr.write(
      "-c is incompatible with -w and -i.  Dropping -w and/or -i.\n")
    dropDumps = True

  if options.noVerify:
    sys.stderr.write(
      "--no-verify is incompatible with -w and -i.  Dropping -w and/or -i.\n")
    dropDumps = True

  # Actually drop the options, so the rest of the program behaves the way
  # the warning says it will.
  if dropDumps:
    options.dumpWindows = False
    options.dumpInstructions = False
1025
1026
def handleError(error, withTraceback=False):
  '''Report error and terminate the program.

  Prints a blank line to stdout, optionally prints the traceback of the
  exception currently being handled (when withTraceback is true), then
  writes the exception's class name and message plus a hint about -f to
  stderr.  Never returns: exits the process with status 1.'''
  print("")
  if withTraceback:
    import traceback
    traceback.print_exc()

  # Format the exception that was passed in, rather than relying on a
  # global named "e" happening to exist in the calling script.
  sys.stderr.write("Error %s: %s\n" % (error.__class__.__name__, str(error)))
  sys.stderr.write("Try running with -f to fix the revision\n")
  sys.exit(1)
1036
1037
1038 if __name__ == '__main__':
1039   from optparse import OptionParser
1040
1041   parser = OptionParser("usage: %prog [-w | -i | -r | -n] REV-FILE")
1042   parser.add_option("-c", "--changed-paths",
1043                     action="store_true", dest="dumpChanged",
1044                     help="Dump changed path information", default=False)
1045   parser.add_option("", "--no-verify",
1046                     action="store_true", dest="noVerify",
1047                     help="Don't parse svndiff streams.", default=False)
1048   parser.add_option("-i", "--instructions",
1049                     action="store_true", dest="dumpInstructions",
1050                     help="Dump instructions (implies -w)", default=False)
1051   parser.add_option("-w", "--windows",
1052                     action="store_true", dest="dumpWindows",
1053                     help="Dump windows", default=False)
1054   parser.add_option("-n", "--noderev-regexp",
1055                     action="store_true", dest="noderevRegexp",
1056                     help="Find all noderevs using a regexp", default=False)
1057   parser.add_option("-f", "--fix-read-length-line-error",
1058                     action="store_true", dest="fixRlle",
1059                     help="Attempt to fix the read length line error",
1060                     default=False)
1061   parser.add_option("-t", "--truncate",
1062                     action="store", type="string", dest="truncate",
1063                     help="Truncate the specified node rev.",
1064                     default=None)
1065   parser.add_option("", "--traceback",
1066                     action="store_true", dest="showTraceback",
1067                     help="Show error tracebacks (mainly used for debugging).",
1068                     default=False)
1069
1070   (options, args) = parser.parse_args()
1071
1072   if len(args) != 1:
1073     print >>sys.stderr, "Please specify exactly one rev file."
1074     parser.print_help()
1075     sys.exit(1)
1076
1077   checkOptions(options)
1078
1079   filename = args[0]
1080
1081   if options.dumpInstructions:
1082     options.dumpWindows = True
1083     LOG_MASK |= LOG_INSTRUCTIONS
1084
1085   if options.dumpWindows:
1086     LOG_MASK |= LOG_WINDOWS
1087
1088   if options.truncate or options.fixRlle:
1089     revFile = open(filename, 'r+b')
1090   else:
1091     revFile = open(filename, 'rb')
1092
1093   (root, changed) = getRootAndChangedPaths(revFile)
1094
1095   if options.dumpChanged:
1096     revFile.seek(changed)
1097     changedPaths = ChangedPaths(revFile)
1098
1099     dumpChangedPaths(changedPaths)
1100     sys.exit(0)
1101
1102   try:
1103     import re
1104     match = re.match('([0-9]+)', os.path.basename(filename))
1105     currentRev = int(match.group(1), 10)
1106   except:
1107     raise CmdlineError, \
1108       "The file name must start with a decimal number that indicates the revision"
1109
1110   if options.noderevRegexp:
1111     strategy = RegexpStrategy(filename, root, currentRev)
1112   else:
1113     strategy = ClassicStrategy(filename, root, currentRev)
1114
1115   # Make stderr the same as stdout.  This helps when trying to catch all of the
1116   # output from a run.
1117   sys.stderr = sys.stdout
1118
1119   try:
1120     for noderev in strategy:
1121       try:
1122         if options.truncate:
1123           # Check to see if this is the rev we need to truncate
1124           if options.truncate == noderev.id:
1125             truncate(noderev, revFile)
1126
1127         else:
1128           print noderev
1129
1130           if not options.noVerify:
1131             if noderev.text:
1132               noderev.text.verify(revFile,
1133                                   options.dumpInstructions,
1134                                   options.dumpWindows)
1135
1136             if noderev.props and noderev.props.rev == noderev.props.currentRev:
1137               noderev.props.verify(revFile,
1138                                    options.dumpInstructions,
1139                                    options.dumpWindows)
1140
1141           print
1142       except:
1143         sys.stdout.flush()
1144         raise
1145   except InvalidRepHeader, e:
1146     if not options.fixRlle:
1147       handleError(e, options.showTraceback)
1148
1149     fixHeader(e, revFile)
1150
1151   except PotentiallyFixableException, e:
1152     if not options.fixRlle:
1153       handleError(e, options.showTraceback)
1154
1155     fixStream(e, revFile)
1156