contrib/server-side/fsfsverify.py

   1 #!/usr/bin/env python
   2 # Copyright (c) 2006, 2007 by John Szakmeister <john at szakmeister dot net>
   3 #
   4 # This program is free software; you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation; either version 2 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program; if not, write to the Free Software
  16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 import os
  19 import optparse
  20 import sys
  21 import re
  22
  23
  24 class FsfsVerifyException(Exception):
  25   pass
  26
  27
  28 class PotentiallyFixableException(FsfsVerifyException):
  29   '''Represents a class of problems that we may be able to fix.'''
  30
  31   def __init__(self, message, offset):
  32     FsfsVerifyException.__init__(self, message)
  33     self.offset = offset
  34
  35
  36 class InvalidInstruction(PotentiallyFixableException):
  37   pass
  38
  39
  40 class InvalidCompressedStream(PotentiallyFixableException):
  41   pass
  42
  43
  44 class InvalidRepHeader(PotentiallyFixableException):
  45   pass
  46
  47
  48 class InvalidWindow(PotentiallyFixableException):
  49   pass
  50
  51
  52 class InvalidSvndiffVersion(FsfsVerifyException):
  53   pass
  54
  55
  56 class InvalidSvndiffHeader(FsfsVerifyException):
  57   pass
  58
  59
  60 class DataCorrupt(FsfsVerifyException):
  61   pass
  62
  63
  64 class NoMoreData(FsfsVerifyException):
  65   pass
  66
  67
  68 LOG_INSTRUCTIONS = 1
  69 LOG_WINDOWS = 2
  70 LOG_SVNDIFF = 4
  71
  72 LOG_MASK = LOG_SVNDIFF
  73
  74
  75 def log(type, indent, format, *args):
  76   if type & LOG_MASK:
  77     indentStr = ' ' * indent
  78     str = format % args
  79     str = '\n'.join([indentStr + x for x in str.split('\n')])
  80     print str
  81
  82
  83 class ByteStream(object):
  84   def __init__(self, fileobj):
  85     self._f = fileobj
  86
  87   def readByte(self):
  88     return ord(self._f.read(1))
  89
  90   def tell(self):
  91     return self._f.tell()
  92
  93   def advance(self, numBytes):
  94     self._f.seek(numBytes, 1)
  95
  96   def clone(self):
  97     if hasattr(self._f, 'clone'):
  98       newFileObj = self._f.clone()
  99     else:
 100       # We expect the file object to map to a real file
 101       #
 102       # Tried using dup(), but (at least on the mac), that ends up
 103       # creating 2 handles to the same underlying os file object,
 104       # instead of two independent file objects.  So, we resort to
 105       # an open call to create a new file object
 106       newFileObj = open(self._f.name, 'rb')
 107       newFileObj.seek(self._f.tell())
 108     return ByteStream(newFileObj)
 109
 110   # The following let ByteStream behave as a file within the
 111   # context of this script.
 112
 113   def read(self, *args, **kwargs):
 114     return self._f.read(*args, **kwargs)
 115
 116   def seek(self, *args, **kwargs):
 117     return self._f.seek(*args, **kwargs)
 118
 119
 120 class ZlibByteStream(ByteStream):
 121   def __init__(self, fileobj, length):
 122     self._f = fileobj
 123
 124     # Store the number of bytes consumed thus far so we can compute an offset
 125     self._numBytesConsumed = 0
 126
 127     self._startingOffset = self._f.tell()
 128
 129     import zlib, binascii
 130     self._z = zlib.decompressobj(15)
 131
 132     self._buffer = self._z.decompress(self._f.read(length))
 133     self._origBufferLength = len(self._buffer)
 134
 135   def readByte(self):
 136     if not self._buffer:
 137       raise NoMoreData, "Unexpected end of data stream!"
 138
 139     byte = self._buffer[0]
 140     self._buffer = self._buffer[1:]
 141
 142     return ord(byte)
 143
 144   def tell(self):
 145     return self._origBufferLength - len(self._buffer)
 146
 147   def advance(self, numBytes):
 148     while numBytes:
 149       self.readByte()
 150
 151   def clone(self):
 152     if hasattr(self._f, 'clone'):
 153       newFileObj = self._f.clone()
 154     else:
 155       newFileObj = open(self._f.name, 'rb')
 156       newFileObj.seek(self._f.tell())
 157     return ByteStream(newFileObj)
 158
 159   # The following let ByteStream behave as a file within the
 160   # context of this script.
 161
 162   def read(self, *args, **kwargs):
 163     raise
 164
 165   def seek(self, *args, **kwargs):
 166     raise
 167
 168
 169 def getVarint(byteStream):
 170   '''Grabs a variable sized int from a bitstream (meaning this function
 171   doesn't seek).'''
 172
 173   i = long(0)
 174   while True:
 175     byte = byteStream.readByte()
 176     i = (i << 7) + (byte & 0x7F)
 177     if byte & 0x80 == 0:
 178       break
 179   return i
 180
 181
 182 INSTR_COPY_SOURCE = 'copy-source'
 183 INSTR_COPY_TARGET = 'copy-target'
 184 INSTR_COPY_DATA = 'copy-data'
 185
 186
 187 class SvndiffInstruction(object):
 188   def __init__(self, byteStream):
 189     self.instrOffset = byteStream.tell()
 190
 191     byte = byteStream.readByte()
 192
 193     instruction = (byte >> 6) & 3
 194     length = byte & 0x3F
 195
 196     if instruction == 3:
 197       raise InvalidInstruction(
 198         "Invalid instruction found at offset %d (%02X)" % (self.instrOffset,
 199                                                            byte),
 200         self.instrOffset)
 201
 202     if instruction == 0:
 203       self.type = INSTR_COPY_SOURCE
 204     elif instruction == 1:
 205       self.type = INSTR_COPY_TARGET
 206     else:
 207       self.type = INSTR_COPY_DATA
 208
 209     if length == 0:
 210       # Length is coded as a varint following the current byte
 211       length = getVarint(byteStream)
 212
 213
 214     self.length = length
 215
 216     if (self.type == INSTR_COPY_SOURCE) or (self.type == INSTR_COPY_TARGET):
 217       self.offset = getVarint(byteStream)
 218
 219     if self.type == INSTR_COPY_SOURCE:
 220       self.sourceOffset = self.offset
 221     else:
 222       self.sourceOffset = 0
 223
 224     if self.type == INSTR_COPY_TARGET:
 225       self.targetOffset = self.offset
 226     else:
 227       self.targetOffset = 0
 228
 229     # Determine the number of bytes consumed in the source stream, target
 230     # stream, and the data stream
 231
 232     if self.type == INSTR_COPY_SOURCE:
 233       self.sourceLength = self.length
 234     else:
 235       self.sourceLength = 0
 236
 237     if self.type == INSTR_COPY_TARGET:
 238       self.targetLength = self.length
 239     else:
 240       self.targetLength = 0
 241
 242     if self.type == INSTR_COPY_DATA:
 243       self.dataLength = self.length
 244     else:
 245       self.dataLength = 0
 246
 247     self.instrLength = byteStream.tell() - self.instrOffset
 248
 249   def __repr__(self):
 250     return '<SvndiffInstruction %s so:%d sl:%d to: %d tl:%d dl:%d (%d, %d)>' % (
 251       self.type, self.sourceOffset, self.sourceLength, self.targetOffset,
 252       self.targetLength, self.dataLength, self.instrOffset, self.instrLength)
 253
 254
 255 class Window(object):
 256   def __init__(self, byteStream, svndiffVersion):
 257     if svndiffVersion not in [0, 1]:
 258       raise InvalidSvndiffVersion, \
 259         "Invalid svndiff version %d" % svndiffVersion
 260
 261     # Record the initial offset of the window
 262     self.windowOffset = byteStream.tell()
 263
 264     try:
 265       self.sourceOffset = getVarint(byteStream)
 266       self.sourceLength = getVarint(byteStream)
 267       self.targetLength = getVarint(byteStream)
 268       self.instrLength = getVarint(byteStream)
 269       self.dataLength = getVarint(byteStream)
 270       self.windowHeaderLength = byteStream.tell() - self.windowOffset
 271       self.windowLength = \
 272         self.windowHeaderLength + self.instrLength + self.dataLength
 273
 274       # Store the byte stream, and clone it for use as a data stream.
 275       self.instrByteStream = byteStream
 276       self.dataByteStream = byteStream.clone()
 277
 278       # Advance the data stream past the instructions to the start of the data.
 279       self.dataByteStream.advance(self.instrLength)
 280     except:
 281       e = InvalidWindow(
 282         "The window header at offset %d appears to be corrupted" % \
 283           (self.windowOffset),
 284         self.windowOffset)
 285       e.windowOffset = self.windowOffset
 286       raise e
 287
 288
 289     # In svndiff1, the instruction area starts with a varint-encoded length.
 290     # If this length matches the one encoded in the header, then there is no
 291     # compression.  If it differs, then the stream is compressed with zlib.
 292
 293     self.origInstrStream = self.instrByteStream
 294     self.origDataStream = self.dataByteStream
 295     self.isInstrCompressed = False
 296     self.isDataCompressed = False
 297     self.compressedInstrLength = self.instrLength
 298     self.compressedDataLength = self.dataLength
 299
 300     if svndiffVersion == 1:
 301       try:
 302         offset = self.instrByteStream.tell()
 303         encodedInstrLength = getVarint(self.instrByteStream)
 304         instrIntSize = self.instrByteStream.tell() - offset
 305
 306         offset = self.dataByteStream.tell()
 307         encodedDataLength = getVarint(self.dataByteStream)
 308         dataIntSize = self.dataByteStream.tell() - offset
 309
 310         self.instrLength = encodedInstrLength
 311         self.dataLength = encodedDataLength
 312       except:
 313         e = InvalidWindow(
 314           "The window header at offset %d appears to be corrupted" % \
 315             (self.windowOffset),
 316           self.windowOffset)
 317         e.windowOffset = self.windowOffset
 318         raise e
 319
 320       # Now, we need to make a determination about whether the data and
 321       # instructions are compressed.  If they are, we need to zlib decompress
 322       # them.  We do that by creating another stream and that will decompress
 323       # the data on the fly.
 324       try:
 325         offset = self.instrByteStream.tell()
 326         if self.compressedInstrLength - instrIntSize != self.instrLength:
 327           self.origInstrStream = self.instrByteStream
 328           self.instrByteStream = ZlibByteStream(self.origInstrStream,
 329                                                 self.compressedInstrLength)
 330           self.isInstrCompressed = True
 331       except Exception, e:
 332         new_e = InvalidCompressedStream(
 333           "Invalid compressed instr stream at offset %d (%s)" % (offset,
 334                                                                  str(e)),
 335           offset)
 336         new_e.windowOffset = self.windowOffset
 337         raise new_e
 338
 339       try:
 340         offset = self.dataByteStream.tell()
 341         if self.compressedDataLength - dataIntSize != self.dataLength:
 342           self.origDataStream = self.dataByteStream
 343           self.dataByteStream = ZlibByteStream(self.origDataStream,
 344                                                self.compressedDataLength)
 345           self.isDataCompressed = True
 346       except Exception, e:
 347         new_e = InvalidCompressedStream(
 348           "Invalid compressed data stream at offset %d (%s)" % (offset,
 349                                                                 str(e)),
 350           offset)
 351         new_e.windowOffset = self.windowOffset
 352         raise new_e
 353
 354   def verify(self):
 355     expectedInstrLength = self.instrLength
 356     expectedDataLength = self.dataLength
 357     expectedTargetLength = self.targetLength
 358     expectedSourceLength = self.sourceLength
 359
 360     computedInstrLength = 0
 361     computedDataLength = 0
 362     computedTargetLength = 0
 363     computedSourceLength = 0
 364
 365     if expectedInstrLength == 0:
 366       e = InvalidWindow(
 367         "Corrupt window (at offset %d) has 0 instructions?!" % self.windowOffset,
 368         self.windowOffset)
 369       e.windowOffset = self.windowOffset
 370       raise e
 371
 372     while computedInstrLength < expectedInstrLength:
 373       try:
 374         instr = SvndiffInstruction(self.instrByteStream)
 375       except PotentiallyFixableException, e:
 376         e.window = self
 377         e.windowOffset = self.windowOffset
 378         raise
 379
 380       log(LOG_INSTRUCTIONS, 4, repr(instr))
 381
 382       computedInstrLength += instr.instrLength
 383       computedDataLength += instr.dataLength
 384       computedSourceLength += instr.sourceLength
 385       computedTargetLength += \
 386         instr.targetLength + instr.sourceLength + instr.dataLength
 387
 388     if computedInstrLength != expectedInstrLength:
 389       e = InvalidWindow(
 390         "The number of instruction bytes consumed (%d) doesn't match the expected number (%d)" % \
 391           (computedInstrLength, expectedInstrLength),
 392         self.windowOffset)
 393       e.windowOffset = self.windowOffset
 394       raise e
 395
 396     if computedDataLength != expectedDataLength:
 397       e = InvalidWindow(
 398         "The number of data bytes consumed (%d) doesn't match the expected number (%d)" % \
 399           (computedDataLength, expectedDataLength),
 400         self.windowOffset)
 401       e.windowOffset = self.windowOffset
 402       raise e
 403
 404     if computedTargetLength != expectedTargetLength:
 405       e = InvalidWindow(
 406         "The number of target bytes consumed (%d) doesn't match the expected number (%d)" % \
 407           (computedTargetLength, expectedTargetLength),
 408         self.windowOffset)
 409       e.windowOffset = self.windowOffset
 410       raise e
 411
 412     # It appears that the source length specified in the window, isn't exactly
 413     # equal to what gets consumed.  I suspect that's because the algorithm is using different
 414     # offsets within the window, and one offset/length pair will reach the end of the window.
 415     # However, this hasn't shown to be a clear indicator of corruption.  So for now, I'm
 416     # commenting it out.
 417     #
 418     #if computedSourceLength != expectedSourceLength:
 419     #  e = InvalidWindow(
 420     #    "The number of source bytes consumed (%d) doesn't match the expected number (%d)" % \
 421     #      (computedSourceLength, expectedSourceLength),
 422     #    self.windowOffset)
 423     #  e.windowOffset = self.windowOffset
 424     #  raise e
 425
 426     # Advance past the data.  We do this using seek because we might have
 427     # read a few bytes from the stream if it potentially had compressed data
 428     self.origInstrStream.seek(self.windowOffset + self.windowLength)
 429
 430   def __repr__(self):
 431     if hasattr(self, 'compressedInstrLength'):
 432       str = 'cil: %d cdl: %d ' % (self.compressedInstrLength,
 433                                   self.compressedDataLength)
 434     else:
 435       str = ''
 436
 437     return "<Window wo:%d so:%d sl:%d tl:%d %sil:%d dl:%d whl:%d wl:%d>" % (
 438       self.windowOffset, self.sourceOffset, self.sourceLength,
 439       self.targetLength, str, self.instrLength, self.dataLength,
 440       self.windowHeaderLength, self.windowLength)
 441
 442
 443 class Svndiff(object):
 444   def __init__(self, fileobj, length):
 445     self._f = fileobj
 446     self.startingOffset = self._f.tell()
 447
 448     header = self._f.read(4)
 449     if len(header) != 4:
 450       raise EOFError, \
 451         "Unexpected end of file while svndiff header at offset %d)" % \
 452         (self._f.tell())
 453
 454     if header[0:3] != 'SVN':
 455       raise InvalidSvndiffHeader, "Invalid svndiff header at offset %d" % \
 456       (self.startingOffset)
 457
 458     self.version = ord(header[3])
 459     if self.version not in [0, 1]:
 460       raise InvalidSvndiffVersion, "Invalid svndiff version %d" % self.version
 461
 462     self._length = length - 4
 463
 464   def verify(self):
 465     self._f.seek(self.startingOffset+4)
 466
 467     bs = ByteStream(self._f)
 468
 469     log(LOG_SVNDIFF, 2, "<Svndiff so: %d ver: %d>", self.startingOffset,
 470         self.version)
 471
 472     try:
 473       remaining = self._length
 474       while remaining > 0:
 475         w = Window(bs, self.version)
 476         log(LOG_WINDOWS, 3, repr(w))
 477         w.verify()
 478         remaining -= w.windowLength
 479     except PotentiallyFixableException, e:
 480       e.svndiffStart = self.startingOffset
 481       raise
 482
 483
 484 def getDirHash(f):
 485   l = f.readline()
 486   if l != 'PLAIN\n':
 487     raise ValueError, "Expected a PLAIN representation (%d)" % f.tell()
 488
 489   hash = {}
 490
 491   while True:
 492     field = f.readline()[:-1]
 493     if field == 'END':
 494       break
 495     assert(field[0] == 'K')
 496     length = int(field.split(' ')[1])
 497     field = f.readline()[:length]
 498
 499     value = f.readline()[:-1]
 500     assert(value[0] == 'V')
 501     length = int(value.split(' ')[1])
 502     value = f.readline()[:length]
 503
 504     (type, txn) = value.split(' ')
 505     hash[field] = [NodeType(type), NodeId(txn)]
 506
 507   return hash
 508
 509
 510
 511 class Rep(object):
 512   def __init__(self, type, rev, offset, length, size, digest,
 513                contentType, currentRev, noderev):
 514     self.type = type
 515     self.rev = int(rev)
 516     self.offset = int(offset)
 517     self.length = int(length)
 518     self.size = int(size)
 519     self.digest = digest
 520     self.currentRev = currentRev
 521
 522     self.contentType = contentType
 523     self.noderev = noderev
 524
 525   def __repr__(self):
 526     if not self.contentType:
 527       contentType = 'UNKNOWN'
 528     else:
 529       if self.contentType not in ['PLAIN', 'DELTA', None]:
 530         contentType = 'INVALID'
 531       else:
 532         contentType = self.contentType
 533     return '%s: %s %d %d %d %d %s' % (self.type, contentType, self.rev,
 534                                       self.offset, self.length, self.size,
 535                                       self.digest)
 536
 537   def verify(self, f, dumpInstructions, dumpWindows):
 538     if self.contentType not in ['PLAIN', 'DELTA', None]:
 539       e = InvalidRepHeader("Invalid rep header found at %d (%s)!" % \
 540                                      (self.offset, self.contentType),
 541                            self.offset)
 542       e.rep = self
 543       e.noderev = self.noderev
 544       raise e
 545
 546     if self.rev != currentRev:
 547       print >>sys.stderr, "Skipping text rep since it isn't present in the current rev"
 548       return
 549
 550     f.seek(self.offset)
 551     header = f.read(5)
 552     if header != self.contentType:
 553       raise FsfsVerifyException, \
 554         "Invalid rep header found at %d (%s, %s)!" % (self.offset, header,
 555                                                       self.contentType)
 556
 557     if header == 'DELTA':
 558       # Consume the rest of the DELTA header
 559       while f.read(1) != '\n':
 560         pass
 561
 562       # This should be the start of the svndiff stream
 563       actual_start = f.tell()
 564       try:
 565         svndiff = Svndiff(f, self.length)
 566         svndiff.verify()
 567         digest = None
 568       except Exception, e:
 569         e.rep = self
 570         e.noderev = self.noderev
 571         raise
 572
 573       if digest:
 574         assert(digest == self.digest)
 575     else:
 576       if f.read(1) != '\n':
 577         raise DataCorrupt, "Expected a '\\n' after PLAIN"
 578
 579       import md5
 580       m = md5.new()
 581       m.update(f.read(self.length))
 582
 583       if self.digest and self.digest != m.hexdigest():
 584         raise DataCorrupt, \
 585           "PLAIN data is corrupted.  Expected digest '%s', computed '%s'." % (
 586             self.digest, m.hexdigest())
 587
 588       if f.read(7) != 'ENDREP\n':
 589         raise DataCorrupt, "Terminating ENDREP missing!"
 590
 591
 592 class TextRep(Rep):
 593   def __init__(self, rev, offset, length, size, digest,
 594                contentType, currentRev, noderev):
 595     super(TextRep,self).__init__('text', rev, offset, length, size,
 596                                  digest, contentType, currentRev, noderev)
 597
 598
 599 class PropRep(Rep):
 600   def __init__(self, rev, offset, length, size, digest,
 601                contentType, currentRev, noderev):
 602     super(PropRep,self).__init__('prop', rev, offset, length, size,
 603                                  digest, contentType, currentRev, noderev)
 604
 605
 606 class NodeId(object):
 607   def __init__(self, nodeid):
 608     (self.txn_name, offset) = nodeid.split('/')
 609     self.offset = int(offset)
 610     self.rev = int(self.txn_name.split('.')[2][1:])
 611
 612   def __repr__(self):
 613     return self.txn_name + '/%d' % self.offset
 614
 615   def __eq__ (self, other):
 616     s = self.txn_name + '/%d' % self.offset
 617     if s == other:
 618       return True
 619
 620     return False
 621
 622
 623 class NodeType(object):
 624   def __init__(self, t):
 625     if (t != 'file') and (t != 'dir'):
 626       raise ValueError, 'Invalid Node type received: "%s"' % t
 627     self.type = t
 628
 629   def __repr__(self):
 630     return self.type[:]
 631
 632
 633 class NodeRev(object):
 634   def __init__(self, f, currentRev):
 635     self.pred = None
 636     self.text = None
 637     self.props = None
 638     self.cpath = None
 639     self.copyroot = None
 640     self.copyfrom = None
 641     self.dir = []
 642
 643     self.nodeOffset = f.tell()
 644
 645     while True:
 646       line = f.readline()
 647       if line == '':
 648         raise IOError, "Unexpected end of file"
 649       if line == '\n':
 650         break
 651
 652       # break apart the line
 653       try:
 654         (field, value) = line.split(':', 1)
 655       except:
 656         print repr(line)
 657         print self.nodeOffset
 658         print f.tell()
 659         raise
 660
 661       # pull of the leading space and trailing new line
 662       value = value[1:-1]
 663
 664       if field == 'id':
 665         self.id = NodeId(value)
 666       elif field == 'type':
 667         self.type = NodeType(value)
 668       elif field == 'pred':
 669         self.pred = NodeId(value)
 670       elif field == 'text':
 671         (rev, offset, length, size, digest) = value.split(' ')
 672         rev = int(rev)
 673         offset = int(offset)
 674         length = int(length)
 675         size = int(size)
 676
 677         if rev != currentRev:
 678           contentType = None
 679         else:
 680           savedOffset = f.tell()
 681           f.seek(offset)
 682           contentType = f.read(5)
 683           f.seek(savedOffset)
 684
 685         self.text = TextRep(rev, offset, length, size, digest,
 686                             contentType, currentRev, self)
 687       elif field == 'props':
 688         (rev, offset, length, size, digest) = value.split(' ')
 689         rev = int(rev)
 690         offset = int(offset)
 691         length = int(length)
 692         size = int(size)
 693
 694         if rev != currentRev:
 695           contentType = None
 696         else:
 697           savedOffset = f.tell()
 698           f.seek(offset)
 699           contentType = f.read(5)
 700           f.seek(savedOffset)
 701
 702         self.props = PropRep(rev, offset, length, size, digest,
 703                              contentType, currentRev, self)
 704       elif field == 'cpath':
 705         self.cpath = value
 706       elif field == 'copyroot':
 707         self.copyroot = value
 708       elif field == 'copyfrom':
 709         self.copyfrom = value
 710
 711     if self.type.type == 'dir':
 712       if self.text:
 713         if self.id.rev == self.text.rev:
 714           offset = f.tell()
 715           f.seek(self.text.offset)
 716           self.dir = getDirHash(f)
 717           f.seek(offset)
 718         else:
 719           # The directory entries are stored in another file.
 720           print "Warning: dir entries are stored in rev %d for noderev %s" % (
 721             self.text.rev, repr(self.id))
 722
 723   def __repr__(self):
 724     str = 'NodeRev Id: %s\n type: %s\n' % (repr(self.id), repr(self.type))
 725     if self.pred:
 726       str = str + ' pred: %s\n' % repr(self.pred)
 727     if self.text:
 728       str = str + ' %s\n' % repr(self.text)
 729     if self.props:
 730       str = str + ' %s\n' % repr(self.props)
 731     if self.cpath:
 732       str = str + ' cpath: %s\n' % self.cpath
 733     if self.copyroot:
 734       str = str + ' copyroot: %s\n' % self.copyroot
 735     if self.copyfrom:
 736       str = str + ' copyfrom: %s\n' % self.copyfrom
 737     if self.dir:
 738       str = str + ' dir contents:\n'
 739       for k in self.dir:
 740         str = str + '  %s: %s\n' % (k, self.dir[k])
 741     return str[:-1]
 742
 743
 744 class ChangedPaths(object):
 745   def __init__(self, f):
 746     self.changedPaths = {}
 747
 748     while True:
 749       currentOffset = revFile.tell()
 750       action = revFile.readline()
 751       if action == '\n' or action == '':
 752         break
 753
 754       path = action[:-1]
 755       try:
 756         (id, action, textMod, propMod) = action[:-1].split(' ')[:4]
 757       except:
 758         raise DataCorrupt, \
 759           "Data appears to be corrupt at offset %d" % currentOffset
 760       path = path[len(' '.join([id, action, textMod, propMod]))+1:]
 761
 762       line = revFile.readline()
 763       if line != '\n':
 764         (copyfromRev, copyfromPath) = line.split(' ')
 765       else:
 766         copyfromRev = -1
 767         copyfromPath = ''
 768
 769       self.changedPaths[path] = (id, action, textMod, propMod,
 770                                  copyfromRev, copyfromPath)
 771
 772
 773   def __iter__(self):
 774     return self.changedPaths.iteritems()
 775
 776
 777 def getRootAndChangedPaths(revFile):
 778   offset = -2
 779   while True:
 780     revFile.seek(offset, 2)
 781     c = revFile.read(1)
 782     if c == '\n':
 783       offset = revFile.tell()
 784       break
 785     offset = offset - 1
 786
 787   (rootNode, changedPaths) = map(int, revFile.readline().split(' '))
 788
 789   return (rootNode, changedPaths)
 790
 791
 792 def dumpChangedPaths(changedPaths):
 793   print "Changed Path Information:"
 794   for (path,
 795        (id, action, textMod, propMod,
 796         copyfromRev, copyfromPath)) in changedPaths:
 797     print " %s:" % path
 798     print "  action: %s" % action
 799     print "  text mod: %s" % textMod
 800     print "  prop mod: %s" % propMod
 801     if copyfromRev != -1:
 802       print "copyfrom path: %s" % copyfromPath
 803       print "copyfrom rev: %s" % copyfromRev
 804     print
 805
 806
 807 class WalkStrategy(object):
 808   def __init__(self, filename, rootOffset, currentRev):
 809     self.f = open(filename, 'rb')
 810     self.rootOffset = rootOffset
 811     self.f.seek(rootOffset)
 812     self.currentRev = currentRev
 813
 814   def _nodeWalker(self):
 815     raise NotImplementedError, "_nodeWalker is not implemented"
 816
 817   def __iter__(self):
 818     self.f.seek(self.rootOffset)
 819     return self._nodeWalker()
 820
 821
 822 class ClassicStrategy(WalkStrategy):
 823   def _nodeWalker (self):
 824     noderev = NodeRev(self.f, self.currentRev)
 825     yield noderev
 826
 827     if noderev.type.type == 'dir':
 828       for e in noderev.dir:
 829         if noderev.dir[e][1].rev == noderev.id.rev:
 830           self.f.seek(noderev.dir[e][1].offset)
 831           for x in self._nodeWalker():
 832             yield x
 833
 834
 835 class RegexpStrategy(WalkStrategy):
 836   def __init__(self, filename, rootOffset, currentRev):
 837     WalkStrategy.__init__(self, filename, rootOffset, currentRev)
 838
 839     # File object passed to the NodeRev() constructor so that it
 840     # doesn't interfere with our regex search.
 841     self.nodeFile = open(filename, 'rb')
 842
 843   def _nodeWalker(self):
 844     nodeId_re = re.compile(r'^id: [a-z0-9\./]+$')
 845
 846     self.f.seek(0)
 847     offset = 0
 848
 849     for line in self.f:
 850       match = nodeId_re.search(line)
 851       if match:
 852         self.nodeFile.seek(offset)
 853         noderev = NodeRev(self.nodeFile, self.currentRev)
 854         yield noderev
 855
 856       offset = offset + len(line)
 857
 858
 859 def verify(noderev, revFile, dumpInstructions, dumpWindows):
 860   print noderev
 861
 862   if noderev.text:
 863     noderev.text.verify(revFile,
 864                         dumpInstructions,
 865                         dumpWindows)
 866
 867   if noderev.props and noderev.props.rev == noderev.props.currentRev:
 868     noderev.props.verify(revFile,
 869                          dumpInstructions,
 870                          dumpWindows)
 871
 872   print
 873
 874
 875 def truncate(noderev, revFile):
 876   txnId = noderev.id
 877
 878   print "Truncating node %s (%s)" % (txnId, noderev.cpath)
 879
 880   # Grab the text rep
 881   textRep = noderev.text
 882
 883   # Fix the text rep contents
 884   offset = textRep.offset
 885   revFile.seek(offset, 0)
 886   revFile.write("PLAIN\x0aENDREP\x0a")
 887
 888   # Fix the node rev
 889   offset = noderev.nodeOffset
 890   revFile.seek(offset, 0)
 891   while True:
 892     savedOffset = revFile.tell()
 893     s = revFile.readline()
 894     if s[:4] == 'text':
 895       revFile.seek(savedOffset, 0)
 896       break
 897
 898   line = revFile.readline()
 899   revFile.seek(savedOffset, 0)
 900   fields = line.split(' ')
 901   overallLength = len(line)
 902
 903   fields[3] = '0' * len(fields[3])
 904   fields[4] = '0' * len(fields[4])
 905   fields[5] = 'd41d8cd98f00b204e9800998ecf8427e'
 906   newTextRep = ' '.join(fields) + '\x0a'
 907   assert(len(newTextRep) == overallLength)
 908   revFile.write(newTextRep)
 909   print "Done."
 910   sys.exit(0)
 911
 912
 913 def fixHeader(e, revFile):
 914   '''Attempt to fix the rep header.  e is expected to be of type
 915   InvalidRepHeader, since the exception stores the necessary information
 916   to help repair the file.'''
 917
 918   # First, we need to locate the real start of the text rep
 919   textrep_re = re.compile(r'^(DELTA( \d+ \d+ \d+)?|PLAIN)$')
 920
 921   revFile.seek(0)
 922   offset = 0
 923   originalOffset = 0
 924   for line in revFile:
 925     m = textrep_re.match(line)
 926     if m:
 927       if offset >= originalOffset and offset < e.offset:
 928         originalOffset = offset
 929         headerLen = len(line)
 930     offset = offset + len(line)
 931
 932   print "Original text rep located at", originalOffset
 933
 934   # Okay, now we have the original offset of the text rep that was
 935   # in the process of being written out.  The header portion of the
 936   # text rep has a fsync() done after it, so the 4K blocks actually
 937   # start after the header.  We need to make sure to copy the header
 938   # and the next 4K, to be on the safe side.
 939   copyLen = 4096 + headerLen
 940
 941   revFile.seek(originalOffset)
 942   block = revFile.read(copyLen)
 943   print "Copy %d bytes from offset %d" % (copyLen, originalOffset)
 944
 945   print "Write %d bytes at offset %d" % (copyLen, e.offset)
 946   revFile.seek(e.offset)
 947   revFile.write(block)
 948   revFile.flush()
 949
 950   print "Fixed? :-)  Re-run fsfsverify without the -f option"
 951
 952
 953 def fixStream(e, revFile):
 954   startOffset = e.svndiffStart
 955   errorOffset = e.windowOffset
 956
 957   repeatedBlockOffset = errorOffset - ((errorOffset - startOffset) % 4096)
 958
 959   # Now we need to move up the rest of the rep
 960
 961   # Determine the final offset by finding the end of the rep.
 962   revFile.seek(errorOffset)
 963
 964   endrep_re = re.compile(".*ENDREP$")
 965   srcLength = 0
 966   for l in revFile:
 967     srcLength += len(l)
 968     m = endrep_re.match(l)
 969     if m:
 970       break
 971
 972   if not m:
 973     raise "Couldn't find end of rep!"
 974
 975   finalOffset = errorOffset + srcLength
 976   srcOffset = errorOffset
 977   destOffset = repeatedBlockOffset
 978
 979   print "Copy %d bytes from offset %d" % (srcLength, srcOffset)
 980   print "Write %d bytes at offset %d" % (srcLength, destOffset)
 981
 982   while srcOffset < finalOffset:
 983     blen = 64*1024
 984     if (finalOffset - srcOffset) < blen:
 985       blen = finalOffset - srcOffset
 986     revFile.seek(srcOffset)
 987     block = revFile.read(blen)
 988     revFile.seek(destOffset)
 989     revFile.write(block)
 990
 991     srcOffset += blen
 992     destOffset += blen
 993
 994   revFile.flush()
 995   revFile.close()
 996
 997   print "Fixed? :-)  Re-run fsfsverify without the -f option"
 998
 999
def checkOptions(options):
  '''Validate the parsed command-line options.

  options is the values object returned by the option parser.  At most one
  of dumpChanged (-c), fixRlle (-f) and truncate (-t) may be set; otherwise
  an error is written to stderr and the program exits with status 1.

  When -w/-i are combined with -c or --no-verify, a warning is written to
  stderr and dumpWindows and dumpInstructions are reset to False on
  options, so the flags really are dropped as the warning says.'''
  exclusive = ('dumpChanged', 'truncate', 'fixRlle')
  count = len([name for name in exclusive if getattr(options, name, None)])

  if count > 1:
    sys.stderr.write("Please use only one of -c, -f, and -t.\n")
    sys.exit(1)

  # Evaluate once, before anything is cleared, so both warnings can fire.
  dumping = options.dumpWindows or options.dumpInstructions

  if options.dumpChanged and dumping:
    sys.stderr.write(
      "-c is incompatible with -w and -i.  Dropping -w and/or -i.\n")

  if options.noVerify and dumping:
    sys.stderr.write(
      "--no-verify is incompatible with -w and -i.  Dropping -w and/or -i.\n")

  if dumping and (options.dumpChanged or options.noVerify):
    options.dumpWindows = False
    options.dumpInstructions = False
1017
1018
def handleError(error, withTraceback=False):
  '''Report error and terminate the program with exit status 1.

  error is the exception to report; its class name and message are written
  to stderr, followed by a hint to retry with -f.  If withTraceback is
  true, the traceback of the exception currently being handled is printed
  first.  This function never returns.'''
  # Blank line to separate the report from any preceding output.
  print("")
  if withTraceback:
    import traceback
    traceback.print_exc()

  # Format the exception that was passed in rather than relying on a
  # module-level name that the caller may happen to have bound.
  sys.stderr.write("Error %s: %s\n" % (error.__class__.__name__, str(error)))
  sys.stderr.write("Try running with -f to fix the revision\n")
  sys.exit(1)
1028
1029
1030 if __name__ == '__main__':
1031   from optparse import OptionParser
1032
1033   parser = OptionParser("usage: %prog [-w | -i | -r | -n] REV-FILE")
1034   parser.add_option("-c", "--changed-paths",
1035                     action="store_true", dest="dumpChanged",
1036                     help="Dump changed path information", default=False)
1037   parser.add_option("", "--no-verify",
1038                     action="store_true", dest="noVerify",
1039                     help="Don't parse svndiff streams.", default=False)
1040   parser.add_option("-i", "--instructions",
1041                     action="store_true", dest="dumpInstructions",
1042                     help="Dump instructions (implies -w)", default=False)
1043   parser.add_option("-w", "--windows",
1044                     action="store_true", dest="dumpWindows",
1045                     help="Dump windows", default=False)
1046   parser.add_option("-n", "--noderev-regexp",
1047                     action="store_true", dest="noderevRegexp",
1048                     help="Find all noderevs using a regexp", default=False)
1049   parser.add_option("-f", "--fix-read-length-line-error",
1050                     action="store_true", dest="fixRlle",
1051                     help="Attempt to fix the read length line error",
1052                     default=False)
1053   parser.add_option("-t", "--truncate",
1054                     action="store", type="string", dest="truncate",
1055                     help="Truncate the specified node rev.",
1056                     default=None)
1057   parser.add_option("", "--traceback",
1058                     action="store_true", dest="showTraceback",
1059                     help="Show error tracebacks (mainly used for debugging).",
1060                     default=False)
1061
1062   (options, args) = parser.parse_args()
1063
1064   if len(args) != 1:
1065     print >>sys.stderr, "Please specify exactly one rev file."
1066     parser.print_help()
1067     sys.exit(1)
1068
1069   checkOptions(options)
1070
1071   filename = args[0]
1072
1073   if options.dumpInstructions:
1074     options.dumpWindows = True
1075     LOG_MASK |= LOG_INSTRUCTIONS
1076
1077   if options.dumpWindows:
1078     LOG_MASK |= LOG_WINDOWS
1079
1080   if options.truncate or options.fixRlle:
1081     revFile = open(filename, 'r+b')
1082   else:
1083     revFile = open(filename, 'rb')
1084
1085   (root, changed) = getRootAndChangedPaths(revFile)
1086
1087   if options.dumpChanged:
1088     revFile.seek(changed)
1089     changedPaths = ChangedPaths(revFile)
1090
1091     dumpChangedPaths(changedPaths)
1092     sys.exit(0)
1093
1094   try:
1095     import re
1096     match = re.match('([0-9]+)', os.path.basename(filename))
1097     currentRev = int(match.group(1), 10)
1098   except:
1099     raise CmdlineError, \
1100       "The file name must start with a decimal number that indicates the revision"
1101
1102   if options.noderevRegexp:
1103     strategy = RegexpStrategy(filename, root, currentRev)
1104   else:
1105     strategy = ClassicStrategy(filename, root, currentRev)
1106
1107   # Make stderr the same as stdout.  This helps when trying to catch all of the
1108   # output from a run.
1109   sys.stderr = sys.stdout
1110
1111   try:
1112     for noderev in strategy:
1113       try:
1114         if options.truncate:
1115           # Check to see if this is the rev we need to truncate
1116           if options.truncate == noderev.id:
1117             truncate(noderev, revFile)
1118
1119         else:
1120           print noderev
1121
1122           if not options.noVerify:
1123             if noderev.text:
1124               noderev.text.verify(revFile,
1125                                   options.dumpInstructions,
1126                                   options.dumpWindows)
1127
1128             if noderev.props and noderev.props.rev == noderev.props.currentRev:
1129               noderev.props.verify(revFile,
1130                                    options.dumpInstructions,
1131                                    options.dumpWindows)
1132
1133           print
1134       except:
1135         sys.stdout.flush()
1136         raise
1137   except InvalidRepHeader, e:
1138     if not options.fixRlle:
1139       handleError(e, options.showTraceback)
1140
1141     fixHeader(e, revFile)
1142
1143   except PotentiallyFixableException, e:
1144     if not options.fixRlle:
1145       handleError(e, options.showTraceback)
1146
1147     fixStream(e, revFile)
1148