cvs2svn_lib/checkout_internal.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2007-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """This module contains classes that implement the --use-internal-co option.
  18
  19 The idea is to patch up the revisions' contents incrementally, thus
  20 avoiding the huge number of process spawns and the O(n^2) overhead of
  21 using 'co' and 'cvs'.
  22
  23 InternalRevisionCollector saves the RCS deltas and RCS revision trees
  24 to databases.  Notably, deltas from the trunk need to be reversed, as
  25 CVS stores them so they apply from HEAD backwards.
  26
  27 InternalRevisionReader produces the revisions' contents on demand.  To
  28 generate the text for a typical revision, we need the revision's delta
  29 text plus the fulltext of the previous revision.  Therefore, we
  30 maintain a checkout database containing a copy of the fulltext of any
  31 revision for which subsequent revisions still need to be retrieved.
  32 It is crucial to remove text from this database as soon as it is no
  33 longer needed, to prevent it from growing enormous.
  34
  35 There are two reasons that the text from a revision can be needed: (1)
  36 because the revision itself still needs to be output to a dumpfile;
  37 (2) because another revision needs it as the base of its delta.  We
  38 maintain a reference count for each revision, which includes *both*
  39 possibilities.  The first time a revision's text is needed, it is
  40 generated by applying the revision's deltatext to the previous
  41 revision's fulltext, and the resulting fulltext is stored in the
  42 checkout database.  Each time a revision's fulltext is retrieved, its
  43 reference count is decremented.  When the reference count goes to
  44 zero, then the fulltext is deleted from the checkout database.
  45
  46 The administrative data for managing this consists of one TextRecord
  47 entry for each revision.  Each TextRecord has an id, which is the same
  48 id as used for the corresponding CVSRevision instance.  It also
  49 maintains a count of the times it is expected to be retrieved.
  50 TextRecords come in several varieties:
  51
  52 FullTextRecord -- Used for revisions whose fulltext is derived
  53     directly from the RCS file by the InternalRevisionCollector (i.e.,
  54     typically revision 1.1 of each file).
  55
  56 DeltaTextRecord -- Used for revisions that are defined via a delta
  57     relative to some other TextRecord.  These records record the id of
  58     the TextRecord that holds the base text against which the delta is
  59     defined.  When the text for a DeltaTextRecord is retrieved, the
  60     DeltaTextRecord instance is deleted and a CheckedOutTextRecord
  61     instance is created to take its place.
  62
  63 CheckedOutTextRecord -- Used during OutputPass for a revision that
  64     started out as a DeltaTextRecord, but has already been retrieved
  65     (and therefore its fulltext is stored in the checkout database).
  66
  67 While a file is being processed during FilterSymbolsPass, the fulltext
  68 and deltas are stored to the delta database, and TextRecord instances
  69 are created to keep track of things.  The reference counts are all
  70 initialized: each record referred to by a delta has its refcount
  71 incremented, and each record that corresponds to a non-delete
  72 CVSRevision is incremented.  After that, any records with refcount==0
  73 are removed.  When one record is removed, that can cause another
  74 record's reference count to go to zero and be removed too,
  75 recursively.  When a TextRecord is deleted at this stage, its
  76 deltatext is also deleted from the delta database."""
  77
  78
  79 from cvs2svn_lib import config
  80 from cvs2svn_lib.common import DB_OPEN_NEW
  81 from cvs2svn_lib.common import DB_OPEN_READ
  82 from cvs2svn_lib.common import warning_prefix
  83 from cvs2svn_lib.common import FatalError
  84 from cvs2svn_lib.common import InternalError
  85 from cvs2svn_lib.common import canonicalize_eol
  86 from cvs2svn_lib.common import is_trunk_revision
  87 from cvs2svn_lib.context import Ctx
  88 from cvs2svn_lib.log import logger
  89 from cvs2svn_lib.artifact_manager import artifact_manager
  90 from cvs2svn_lib.cvs_item import CVSRevisionModification
  91 from cvs2svn_lib.indexed_database import IndexedDatabase
  92 from cvs2svn_lib.rcs_stream import RCSStream
  93 from cvs2svn_lib.rcs_stream import MalformedDeltaException
  94 from cvs2svn_lib.keyword_expander import expand_keywords
  95 from cvs2svn_lib.keyword_expander import collapse_keywords
  96 from cvs2svn_lib.revision_manager import RevisionCollector
  97 from cvs2svn_lib.revision_manager import RevisionReader
  98 from cvs2svn_lib.serializer import MarshalSerializer
  99 from cvs2svn_lib.serializer import CompressingSerializer
 100 from cvs2svn_lib.serializer import PrimedPickleSerializer
 101 from cvs2svn_lib.apple_single_filter import get_maybe_apple_single
 102
 103 from cvs2svn_lib.rcsparser import Sink
 104 from cvs2svn_lib.rcsparser import parse
 105
 106
 107 class TextRecord(object):
 108   """Bookkeeping data for the text of a single CVSRevision."""
 109
 110   __slots__ = ['id', 'refcount']
 111
 112   def __init__(self, id):
 113     # The cvs_rev_id of the revision whose text this is.
 114     self.id = id
 115
 116     # The number of times that the text of this revision will be
 117     # retrieved.
 118     self.refcount = 0
 119
 120   def __getstate__(self):
 121     return (self.id, self.refcount,)
 122
 123   def __setstate__(self, state):
 124     (self.id, self.refcount,) = state
 125
 126   def increment_dependency_refcounts(self, text_record_db):
 127     """Increment the refcounts of any records that this one depends on."""
 128
 129     pass
 130
 131   def decrement_refcount(self, text_record_db):
 132     """Decrement the number of times our text still has to be checked out.
 133
 134     If the reference count goes to zero, call discard()."""
 135
 136     self.refcount -= 1
 137     if self.refcount == 0:
 138       text_record_db.discard(self.id)
 139
 140   def checkout(self, text_record_db):
 141     """Workhorse of the checkout process.
 142
 143     Return the text for this revision, decrement our reference count,
 144     and update the databases depending on whether there will be future
 145     checkouts."""
 146
 147     raise NotImplementedError()
 148
 149   def free(self, text_record_db):
 150     """This instance will never again be checked out; free it.
 151
 152     Also free any associated resources and decrement the refcounts of
 153     any other TextRecords that this one depends on."""
 154
 155     raise NotImplementedError()
 156
 157
 158 class FullTextRecord(TextRecord):
 159   """A record whose revision's fulltext is stored in the delta_db.
 160
 161   These records are used for revisions whose fulltext was determined
 162   by the InternalRevisionCollector during FilterSymbolsPass.  The
 163   fulltext for such a revision is is stored in the delta_db as a
 164   single string."""
 165
 166   __slots__ = []
 167
 168   def __getstate__(self):
 169     return (self.id, self.refcount,)
 170
 171   def __setstate__(self, state):
 172     (self.id, self.refcount,) = state
 173
 174   def checkout(self, text_record_db):
 175     text = text_record_db.delta_db[self.id]
 176     self.decrement_refcount(text_record_db)
 177     return text
 178
 179   def free(self, text_record_db):
 180     del text_record_db.delta_db[self.id]
 181
 182   def __str__(self):
 183     return 'FullTextRecord(%x, %d)' % (self.id, self.refcount,)
 184
 185
 186 class DeltaTextRecord(TextRecord):
 187   """A record whose revision's delta is stored as an RCS delta.
 188
 189   The text of this revision must be derived by applying an RCS delta
 190   to the text of the predecessor revision.  The RCS delta is stored
 191   in the delta_db."""
 192
 193   __slots__ = ['pred_id']
 194
 195   def __init__(self, id, pred_id):
 196     TextRecord.__init__(self, id)
 197
 198     # The cvs_rev_id of the revision relative to which this delta is
 199     # defined.
 200     self.pred_id = pred_id
 201
 202   def __getstate__(self):
 203     return (self.id, self.refcount, self.pred_id,)
 204
 205   def __setstate__(self, state):
 206     (self.id, self.refcount, self.pred_id,) = state
 207
 208   def increment_dependency_refcounts(self, text_record_db):
 209     text_record_db[self.pred_id].refcount += 1
 210
 211   def checkout(self, text_record_db):
 212     base_text = text_record_db[self.pred_id].checkout(text_record_db)
 213     rcs_stream = RCSStream(base_text)
 214     delta_text = text_record_db.delta_db[self.id]
 215     rcs_stream.apply_diff(delta_text)
 216     text = rcs_stream.get_text()
 217     del rcs_stream
 218     self.refcount -= 1
 219     if self.refcount == 0:
 220       # This text will never be needed again; just delete ourselves
 221       # without ever having stored the fulltext to the checkout
 222       # database:
 223       del text_record_db[self.id]
 224     else:
 225       # Store a new CheckedOutTextRecord in place of ourselves:
 226       text_record_db.checkout_db['%x' % self.id] = text
 227       new_text_record = CheckedOutTextRecord(self.id)
 228       new_text_record.refcount = self.refcount
 229       text_record_db.replace(new_text_record)
 230     return text
 231
 232   def free(self, text_record_db):
 233     del text_record_db.delta_db[self.id]
 234     text_record_db[self.pred_id].decrement_refcount(text_record_db)
 235
 236   def __str__(self):
 237     return 'DeltaTextRecord(%x -> %x, %d)' % (
 238         self.pred_id, self.id, self.refcount,
 239         )
 240
 241
 242 class CheckedOutTextRecord(TextRecord):
 243   """A record whose revision's fulltext is stored in the text_record_db.
 244
 245   These records are used for revisions whose fulltext has been
 246   computed already during OutputPass.  The fulltext for such a
 247   revision is stored in the text_record_db as a single string."""
 248
 249   __slots__ = []
 250
 251   def __getstate__(self):
 252     return (self.id, self.refcount,)
 253
 254   def __setstate__(self, state):
 255     (self.id, self.refcount,) = state
 256
 257   def checkout(self, text_record_db):
 258     text = text_record_db.checkout_db['%x' % self.id]
 259     self.decrement_refcount(text_record_db)
 260     return text
 261
 262   def free(self, text_record_db):
 263     del text_record_db.checkout_db['%x' % self.id]
 264
 265   def __str__(self):
 266     return 'CheckedOutTextRecord(%x, %d)' % (self.id, self.refcount,)
 267
 268
 269 class NullDatabase(object):
 270   """A do-nothing database that can be used with TextRecordDatabase.
 271
 272   Use this when you don't actually want to allow anything to be
 273   deleted."""
 274
 275   def __delitem__(self, id):
 276     pass
 277
 278
 279 class TextRecordDatabase:
 280   """Holds the TextRecord instances that are currently live.
 281
 282   During FilterSymbolsPass, files are processed one by one and a new
 283   TextRecordDatabase instance is used for each file.  During
 284   OutputPass, a single TextRecordDatabase instance is used for the
 285   duration of OutputPass; individual records are added and removed
 286   when they are active."""
 287
 288   def __init__(self, delta_db, checkout_db):
 289     # A map { cvs_rev_id -> TextRecord }.
 290     self.text_records = {}
 291
 292     # A database-like object using cvs_rev_ids as keys and containing
 293     # fulltext/deltatext strings as values.  Its __getitem__() method
 294     # is used to retrieve deltas when they are needed, and its
 295     # __delitem__() method is used to delete deltas when they can be
 296     # freed.  The modifiability of the delta database varies from pass
 297     # to pass, so the object stored here varies as well:
 298     #
 299     # FilterSymbolsPass: a NullDatabase.  The delta database cannot be
 300     #     modified during this pass, and we have no need to retrieve
 301     #     deltas, so we just use a dummy object here.
 302     #
 303     # OutputPass: a disabled IndexedDatabase.  During this pass we
 304     #     need to retrieve deltas, but we are not allowed to modify
 305     #     the delta database.  So we use an IndexedDatabase whose
 306     #     __del__() method has been disabled to do nothing.
 307     self.delta_db = delta_db
 308
 309     # A database-like object using cvs_rev_ids as keys and containing
 310     # fulltext strings as values.  This database is only set during
 311     # OutputPass.
 312     self.checkout_db = checkout_db
 313
 314     # If this is set to a list, then the list holds the ids of
 315     # text_records that have to be deleted; when discard() is called,
 316     # it adds the requested id to the list but does not delete it.  If
 317     # this member is set to None, then text_records are deleted
 318     # immediately when discard() is called.
 319     self.deferred_deletes = None
 320
 321   def __getstate__(self):
 322     return (self.text_records.values(),)
 323
 324   def __setstate__(self, state):
 325     (text_records,) = state
 326     self.text_records = {}
 327     for text_record in text_records:
 328       self.add(text_record)
 329     self.delta_db = NullDatabase()
 330     self.checkout_db = NullDatabase()
 331     self.deferred_deletes = None
 332
 333   def add(self, text_record):
 334     """Add TEXT_RECORD to our database.
 335
 336     There must not already be a record with the same id."""
 337
 338     assert not self.text_records.has_key(text_record.id)
 339
 340     self.text_records[text_record.id] = text_record
 341
 342   def __getitem__(self, id):
 343     return self.text_records[id]
 344
 345   def __delitem__(self, id):
 346     """Free the record with the specified ID."""
 347
 348     del self.text_records[id]
 349
 350   def replace(self, text_record):
 351     """Store TEXT_RECORD in place of the existing record with the same id.
 352
 353     Do not do anything with the old record."""
 354
 355     assert self.text_records.has_key(text_record.id)
 356     self.text_records[text_record.id] = text_record
 357
 358   def discard(self, *ids):
 359     """The text records with IDS are no longer needed; discard them.
 360
 361     This involves calling their free() methods and also removing them
 362     from SELF.
 363
 364     If SELF.deferred_deletes is not None, then the ids to be deleted
 365     are added to the list instead of deleted immediately.  This
 366     mechanism is to prevent a stack overflow from the avalanche of
 367     deletes that can result from deleting a long chain of revisions."""
 368
 369     if self.deferred_deletes is None:
 370       # This is an outer-level delete.
 371       self.deferred_deletes = list(ids)
 372       while self.deferred_deletes:
 373         id = self.deferred_deletes.pop()
 374         text_record = self[id]
 375         if text_record.refcount != 0:
 376           raise InternalError(
 377               'TextRecordDatabase.discard(%s) called with refcount = %d'
 378               % (text_record, text_record.refcount,)
 379               )
 380         # This call might cause other text_record ids to be added to
 381         # self.deferred_deletes:
 382         text_record.free(self)
 383         del self[id]
 384       self.deferred_deletes = None
 385     else:
 386       self.deferred_deletes.extend(ids)
 387
 388   def itervalues(self):
 389     return self.text_records.itervalues()
 390
 391   def recompute_refcounts(self, cvs_file_items):
 392     """Recompute the refcounts of the contained TextRecords.
 393
 394     Use CVS_FILE_ITEMS to determine which records will be needed by
 395     cvs2svn."""
 396
 397     # First clear all of the refcounts:
 398     for text_record in self.itervalues():
 399       text_record.refcount = 0
 400
 401     # Now increment the reference count of records that are needed as
 402     # the source of another record's deltas:
 403     for text_record in self.itervalues():
 404       text_record.increment_dependency_refcounts(self.text_records)
 405
 406     # Now increment the reference count of records that will be needed
 407     # by cvs2svn:
 408     for lod_items in cvs_file_items.iter_lods():
 409       for cvs_rev in lod_items.cvs_revisions:
 410         if isinstance(cvs_rev, CVSRevisionModification):
 411           self[cvs_rev.id].refcount += 1
 412
 413   def free_unused(self):
 414     """Free any TextRecords whose reference counts are zero."""
 415
 416     # The deletion of some of these text records might cause others to
 417     # be unused, in which case they will be deleted automatically.
 418     # But since the initially-unused records are not referred to by
 419     # any others, we don't have to be afraid that they will be deleted
 420     # before we get to them.  But it *is* crucial that we create the
 421     # whole unused list before starting the loop.
 422
 423     unused = [
 424         text_record.id
 425         for text_record in self.itervalues()
 426         if text_record.refcount == 0
 427         ]
 428
 429     self.discard(*unused)
 430
 431   def log_leftovers(self):
 432     """If any TextRecords still exist, log them."""
 433
 434     if self.text_records:
 435       logger.warn(
 436           "%s: internal problem: leftover revisions in the checkout cache:"
 437           % warning_prefix)
 438       for text_record in self.itervalues():
 439         logger.warn('    %s' % (text_record,))
 440
 441   def __repr__(self):
 442     """Debugging output of the current contents of the TextRecordDatabase."""
 443
 444     retval = ['TextRecordDatabase:']
 445     for text_record in self.itervalues():
 446       retval.append('    %s' % (text_record,))
 447     return '\n'.join(retval)
 448
 449
class _Sink(Sink):
  """rcsparser Sink that feeds one RCS file's texts to a revision collector.

  An instance is handed to parse() together with the open RCS file.
  For each revision whose text it is given, it creates a TextRecord
  and passes the record and the text to be stored to
  REVISION_COLLECTOR._writeout().  Trunk deltas are inverted on the
  way, so that every stored delta leads from a revision to its
  logical successor."""

  def __init__(self, revision_collector, cvs_file_items):
    # The object whose _writeout() method receives each text record
    # and its text.
    self.revision_collector = revision_collector

    # Describes the file being parsed; its original_ids map is used to
    # translate revision numbers to cvs_rev_ids.
    self.cvs_file_items = cvs_file_items

    # A map {rev : base_rev} indicating that the text for rev is
    # stored in CVS as a delta relative to base_rev.
    self.base_revisions = {}

    # The revision that is stored with its fulltext in CVS (usually
    # the newest revision on trunk):
    self.head_revision = None

    # The first logical revision on trunk (usually '1.1'):
    self.revision_1_1 = None

    # Keep track of the revisions whose revision info has been seen so
    # far (to avoid repeated revision info blocks):
    self.revisions_seen = set()

  def set_head_revision(self, revision):
    """Remember REVISION as the one whose fulltext is stored in the file."""

    self.head_revision = revision

  def define_revision(
        self, revision, timestamp, author, state, branches, next
        ):
    """Record which revisions are stored as deltas relative to REVISION.

    Those are NEXT (if there is one) and the first revision of each
    branch in BRANCHES.  A trunk revision without a NEXT is remembered
    as the first logical revision on trunk."""

    if next:
      self.base_revisions[next] = revision
    else:
      if is_trunk_revision(revision):
        self.revision_1_1 = revision

    for branch in branches:
      self.base_revisions[branch] = revision

  def set_revision_info(self, revision, log, text):
    """Write out the text record(s) that TEXT makes available.

    TEXT is the text stored in the RCS file for REVISION: the fulltext
    for the head revision, otherwise a delta.  A second text block for
    a REVISION that has already been seen is ignored.  If a trunk
    delta is malformed, log an error and raise RuntimeError."""

    if revision in self.revisions_seen:
      # One common form of CVS repository corruption is that the
      # Deltatext block for revision 1.1 appears twice.  CollectData
      # has already warned about this problem; here we can just ignore
      # it.
      return
    else:
      self.revisions_seen.add(revision)

    cvs_rev_id = self.cvs_file_items.original_ids[revision]
    if is_trunk_revision(revision):
      # On trunk, revisions are encountered in reverse order (1.<N>
      # ... 1.1) and deltas are inverted.  The first text that we see
      # is the fulltext for the HEAD revision.  After that, the text
      # corresponding to revision 1.N is the delta (1.<N+1> ->
      # 1.<N>).  We have to invert the deltas here so that we can
      # read the revisions out in dependency order; that is, for
      # revision 1.1 we want the fulltext, and for revision 1.<N> we
      # want the delta (1.<N-1> -> 1.<N>).  This means that we can't
      # compute the delta for a revision until we see its logical
      # parent.  When we finally see revision 1.1 (which is recognized
      # because it doesn't have a parent), we can record the diff (1.1
      # -> 1.2) for revision 1.2, and also the fulltext for 1.1.

      if revision == self.head_revision:
        # This is HEAD, as fulltext.  Initialize the RCSStream so
        # that we can compute deltas backwards in time.
        self._rcs_stream = RCSStream(text)
        self._rcs_stream_revision = revision
      else:
        # Any other trunk revision is a backward delta.  Apply the
        # delta to the RCSStream to mutate it to the contents of this
        # revision, and also to get the reverse delta, which we store
        # as the forward delta of our child revision.
        try:
          text = self._rcs_stream.invert_diff(text)
        except MalformedDeltaException, e:
          logger.error(
              'Malformed RCS delta in %s, revision %s: %s'
              % (self.cvs_file_items.cvs_file.rcs_path, revision, e)
              )
          raise RuntimeError()
        # The record belongs to the child revision (the one that the
        # stream held until now); this revision is its predecessor.
        text_record = DeltaTextRecord(
            self.cvs_file_items.original_ids[self._rcs_stream_revision],
            cvs_rev_id
            )
        self.revision_collector._writeout(text_record, text)
        self._rcs_stream_revision = revision

      if revision == self.revision_1_1:
        # This is revision 1.1.  Write its fulltext:
        text_record = FullTextRecord(cvs_rev_id)
        self.revision_collector._writeout(
            text_record, self._rcs_stream.get_text()
            )

        # There will be no more trunk revisions delivered, so free the
        # RCSStream.
        del self._rcs_stream
        del self._rcs_stream_revision

    else:
      # On branches, revisions are encountered in logical order
      # (<BRANCH>.1 ... <BRANCH>.<N>) and the text corresponding to
      # revision <BRANCH>.<N> is the forward delta (<BRANCH>.<N-1> ->
      # <BRANCH>.<N>).  That's what we need, so just store it.

      # FIXME: It would be nice to avoid writing out branch deltas
      # when --trunk-only.  (They will be deleted when finish_file()
      # is called, but if the delta db is in an IndexedDatabase the
      # deletions won't actually recover any disk space.)
      text_record = DeltaTextRecord(
          cvs_rev_id,
          self.cvs_file_items.original_ids[self.base_revisions[revision]]
          )
      self.revision_collector._writeout(text_record, text)

    return None
 564
 565
 566 class InternalRevisionCollector(RevisionCollector):
 567   """The RevisionCollector used by InternalRevisionReader."""
 568
 569   def __init__(self, compress):
 570     RevisionCollector.__init__(self)
 571     self._compress = compress
 572
 573   def register_artifacts(self, which_pass):
 574     artifact_manager.register_temp_file(
 575         config.RCS_DELTAS_INDEX_TABLE, which_pass
 576         )
 577     artifact_manager.register_temp_file(config.RCS_DELTAS_STORE, which_pass)
 578     artifact_manager.register_temp_file(
 579         config.RCS_TREES_INDEX_TABLE, which_pass
 580         )
 581     artifact_manager.register_temp_file(config.RCS_TREES_STORE, which_pass)
 582
 583   def start(self):
 584     serializer = MarshalSerializer()
 585     if self._compress:
 586       serializer = CompressingSerializer(serializer)
 587     self._delta_db = IndexedDatabase(
 588         artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
 589         artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
 590         DB_OPEN_NEW, serializer,
 591         )
 592     primer = (FullTextRecord, DeltaTextRecord)
 593     self._rcs_trees = IndexedDatabase(
 594         artifact_manager.get_temp_file(config.RCS_TREES_STORE),
 595         artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
 596         DB_OPEN_NEW, PrimedPickleSerializer(primer),
 597         )
 598
 599   def _writeout(self, text_record, text):
 600     self.text_record_db.add(text_record)
 601     self._delta_db[text_record.id] = text
 602
 603   def process_file(self, cvs_file_items):
 604     """Read revision information for the file described by CVS_FILE_ITEMS.
 605
 606     Compute the text record refcounts, discard any records that are
 607     unneeded, and store the text records for the file to the
 608     _rcs_trees database."""
 609
 610     # A map from cvs_rev_id to TextRecord instance:
 611     self.text_record_db = TextRecordDatabase(self._delta_db, NullDatabase())
 612
 613     f = open(cvs_file_items.cvs_file.rcs_path, 'rb')
 614     try:
 615       parse(f, _Sink(self, cvs_file_items))
 616     finally:
 617       f.close()
 618
 619     self.text_record_db.recompute_refcounts(cvs_file_items)
 620     self.text_record_db.free_unused()
 621     self._rcs_trees[cvs_file_items.cvs_file.id] = self.text_record_db
 622     del self.text_record_db
 623
 624   def finish(self):
 625     self._delta_db.close()
 626     self._rcs_trees.close()
 627
 628
 629 class InternalRevisionReader(RevisionReader):
 630   """A RevisionReader that reads the contents from an own delta store."""
 631
 632   def __init__(self, compress):
 633     # Only import Database if an InternalRevisionReader is really
 634     # instantiated, because the import fails if a decent dbm is not
 635     # installed.
 636     from cvs2svn_lib.database import Database
 637     self._Database = Database
 638
 639     self._compress = compress
 640
 641   def register_artifacts(self, which_pass):
 642     artifact_manager.register_temp_file(config.CVS_CHECKOUT_DB, which_pass)
 643     artifact_manager.register_temp_file_needed(
 644         config.RCS_DELTAS_STORE, which_pass
 645         )
 646     artifact_manager.register_temp_file_needed(
 647         config.RCS_DELTAS_INDEX_TABLE, which_pass
 648         )
 649     artifact_manager.register_temp_file_needed(
 650         config.RCS_TREES_STORE, which_pass
 651         )
 652     artifact_manager.register_temp_file_needed(
 653         config.RCS_TREES_INDEX_TABLE, which_pass
 654         )
 655
 656   def start(self):
 657     self._delta_db = IndexedDatabase(
 658         artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
 659         artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
 660         DB_OPEN_READ,
 661         )
 662     self._delta_db.__delitem__ = lambda id: None
 663     self._tree_db = IndexedDatabase(
 664         artifact_manager.get_temp_file(config.RCS_TREES_STORE),
 665         artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
 666         DB_OPEN_READ,
 667         )
 668     serializer = MarshalSerializer()
 669     if self._compress:
 670       serializer = CompressingSerializer(serializer)
 671     self._co_db = self._Database(
 672         artifact_manager.get_temp_file(config.CVS_CHECKOUT_DB),
 673         DB_OPEN_NEW, serializer,
 674         )
 675
 676     # The set of CVSFile instances whose TextRecords have already been
 677     # read:
 678     self._loaded_files = set()
 679
 680     # A map { CVSFILE : _FileTree } for files that currently have live
 681     # revisions:
 682     self._text_record_db = TextRecordDatabase(self._delta_db, self._co_db)
 683
 684   def _get_text_record(self, cvs_rev):
 685     """Return the TextRecord instance for CVS_REV.
 686
 687     If the TextRecords for CVS_REV.cvs_file haven't been loaded yet,
 688     do so now."""
 689
 690     if cvs_rev.cvs_file not in self._loaded_files:
 691       for text_record in self._tree_db[cvs_rev.cvs_file.id].itervalues():
 692         self._text_record_db.add(text_record)
 693       self._loaded_files.add(cvs_rev.cvs_file)
 694
 695     return self._text_record_db[cvs_rev.id]
 696
 697   def get_content(self, cvs_rev):
 698     """Check out the text for revision C_REV from the repository.
 699
 700     Return the text.  If CVS_REV has a property _keyword_handling, use
 701     it to determine how to handle RCS keywords in the output:
 702
 703         'collapsed' -- collapse keywords
 704
 705         'expanded' -- expand keywords
 706
 707         'untouched' -- output keywords in the form they are found in
 708             the RCS file
 709
 710     Note that $Log$ never actually generates a log (which makes test
 711     'requires_cvs()' fail).
 712
 713     Revisions may be requested in any order, but if they are not
 714     requested in dependency order the checkout database will become
 715     very large.  Revisions may be skipped.  Each revision may be
 716     requested only once."""
 717
 718     try:
 719       text = self._get_text_record(cvs_rev).checkout(self._text_record_db)
 720     except MalformedDeltaException, (msg):
 721       raise FatalError(
 722           'Malformed RCS delta in %s, revision %s: %s'
 723           % (cvs_rev.cvs_file.rcs_path, cvs_rev.rev, msg)
 724           )
 725
 726     keyword_handling = cvs_rev.get_property('_keyword_handling')
 727
 728     if keyword_handling == 'untouched':
 729       # Leave keywords in the form that they were checked in.
 730       pass
 731     elif keyword_handling == 'collapsed':
 732       text = collapse_keywords(text)
 733     elif keyword_handling == 'expanded':
 734       text = expand_keywords(text, cvs_rev)
 735     else:
 736       raise FatalError(
 737           'Undefined _keyword_handling property (%r) for %s'
 738           % (keyword_handling, cvs_rev,)
 739           )
 740
 741     if Ctx().decode_apple_single:
 742       # Insert a filter to decode any files that are in AppleSingle
 743       # format:
 744       text = get_maybe_apple_single(text)
 745
 746     eol_fix = cvs_rev.get_property('_eol_fix')
 747     if eol_fix:
 748       text = canonicalize_eol(text, eol_fix)
 749
 750     return text
 751
 752   def finish(self):
 753     self._text_record_db.log_leftovers()
 754
 755     del self._text_record_db
 756     self._delta_db.close()
 757     self._tree_db.close()
 758     self._co_db.close()
 759