Tools/scripts/ndiff.py

   1 #! /usr/bin/env python
   2
   3 # Released to the public domain $JustDate:  3/16/98 $,
   4 # by Tim Peters (email tim_one@email.msn.com).
   5
   6 # ndiff file1 file2 -- a human-friendly file differencer.
   7
   8 # $Revision$
   9
  10 # SequenceMatcher tries to compute a "human-friendly diff" between
  11 # two sequences (chiefly picturing a file as a sequence of lines,
  12 # and a line as a sequence of characters, here).  Unlike UNIX(tm) diff,
  13 # e.g., the fundamental notion is the longest *contiguous* & junk-free
  14 # matching subsequence.  That's what catches peoples' eyes.  The
  15 # Windows(tm) windiff has another interesting notion, pairing up elements
  16 # that appear uniquely in each sequence.  That, and the method here,
  17 # appear to yield more intuitive difference reports than does diff.  This
  18 # method appears to be the least vulnerable to synching up on blocks
  19 # of "junk lines", though (like blank lines in ordinary text files,
  20 # or maybe "<P>" lines in HTML files).  That may be because this is
  21 # the only method of the 3 that has a *concept* of "junk" <wink>.
  22 #
  23 # Note that ndiff makes no claim to produce a *minimal* diff.  To the
  24 # contrary, minimal diffs are often counter-intuitive, because they
  25 # synch up anywhere possible, sometimes accidental matches 100 pages
  26 # apart.  Restricting synch points to contiguous matches preserves some
  27 # notion of locality, at the occasional cost of producing a longer diff.
  28 #
  29 # With respect to junk, an earlier verion of ndiff simply refused to
  30 # *start* a match with a junk element.  The result was cases like this:
  31 #     before: private Thread currentThread;
  32 #     after:  private volatile Thread currentThread;
  33 # If you consider whitespace to be junk, the longest continguous match
  34 # not starting with junk is "e Thread currentThread".  So ndiff reported
  35 # that "e volatil" was inserted between the 't' and the 'e' in "private".
  36 # While an accurate view, to people that's absurd.  The current version
  37 # looks for matching blocks that are entirely junk-free, then extends the
  38 # longest one of those as far as possible but only with matching junk.
  39 # So now "currentThread" is matched, then extended to suck up the
  40 # preceding blank; then "private" is matched, and extended to suck up the
  41 # following blank; then "Thread" is matched; and finally ndiff reports
  42 # that "volatile " was inserted before "Thread".  The only quibble
  43 # remaining is that perhaps it was really the case that " volative"
  44 # was inserted after "private".  I can live with that <wink>.
  45 #
  46 # NOTE on the output:  From an ndiff report,
  47 # 1) The first file can be recovered by retaining only lines that begin
  48 #    with "  " or "- ", and deleting those 2-character prefixes.
  49 # 2) The second file can be recovered similarly, but by retaining only
  50 #    "  " and "+ " lines.
  51 # 3) Lines beginning with "? " attempt to guide the eye to intraline
  52 #    differences, and were not present in either input file.
  53 #
  54 # COROLLARY:
  55 # On Unix, the second file can be recovered by piping the output through
  56 #    sed -n '/^[+ ] /s/^..//p'
  57 # Modifications to recover the first file are left as an exercise for
  58 # the reader.
  59 #
  60 # NOTE on junk:  the module-level names
  61 #    IS_LINE_JUNK
  62 #    IS_CHARACTER_JUNK
  63 # can be set to any functions you like.  The first one should accept
  64 # a single string argument, and return true iff the string is junk.
  65 # The default is whether the regexp r"\s*#?\s*$" matches (i.e., a
  66 # line without visible characters, except for at most one splat).
  67 # The second should accept a string of length 1 etc.  The default is
  68 # whether the character is a blank or tab (note: bad idea to include
  69 # newline in this!).
  70 #
  71 # After setting those, you can call fcompare(f1name, f2name) with the
  72 # names of the files you want to compare.  The difference report
  73 # is sent to stdout.  Or you can call main(), which expects to find
  74 # (exactly) the two file names in sys.argv.
  75
  76 import string
  77 TRACE = 0
  78
  79 # define what "junk" means
  80 import re
  81
  82 def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
  83     return pat(line) is not None
  84
  85 def IS_CHARACTER_JUNK(ch, ws=" \t"):
  86     return ch in ws
  87
  88 del re
  89
  90 class SequenceMatcher:
  91     def __init__(self, isjunk=None, a='', b=''):
  92         # Members:
  93         # a
  94         #      first sequence
  95         # b
  96         #      second sequence; differences are computed as "what do
  97         #      we need to do to 'a' to change it into 'b'?"
  98         # b2j
  99         #      for x in b, b2j[x] is a list of the indices (into b)
 100         #      at which x appears; junk elements do not appear
 101         # b2jhas
 102         #      b2j.has_key
 103         # fullbcount
 104         #      for x in b, fullbcount[x] == the number of times x
 105         #      appears in b; only materialized if really needed (used
 106         #      only for computing quick_ratio())
 107         # matching_blocks
 108         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
 109         #      ascending & non-overlapping in i and in j; terminated by
 110         #      a dummy (len(a), len(b), 0) sentinel
 111         # opcodes
 112         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
 113         #      one of
 114         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
 115         #          'delete'    a[i1:i2] should be deleted
 116         #          'insert'    b[j1:j2] should be inserted
 117         #          'equal'     a[i1:i2] == b[j1:j2]
 118         # isjunk
 119         #      a user-supplied function taking a sequence element and
 120         #      returning true iff the element is "junk" -- this has
 121         #      subtle but helpful effects on the algorithm, which I'll
 122         #      get around to writing up someday <0.9 wink>.
 123         #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
 124         # isbjunk
 125         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
 126         #      it's really the has_key method of a hidden dict.
 127         #      DOES NOT WORK for x in a!
 128
 129         self.isjunk = isjunk
 130         self.a = self.b = None
 131         self.set_seqs(a, b)
 132
 133     def set_seqs(self, a, b):
 134         self.set_seq1(a)
 135         self.set_seq2(b)
 136
 137     def set_seq1(self, a):
 138         if a is self.a:
 139             return
 140         self.a = a
 141         self.matching_blocks = self.opcodes = None
 142
 143     def set_seq2(self, b):
 144         if b is self.b:
 145             return
 146         self.b = b
 147         self.matching_blocks = self.opcodes = None
 148         self.fullbcount = None
 149         self.__chain_b()
 150
 151     # for each element x in b, set b2j[x] to a list of the indices in
 152     # b where x appears; the indices are in increasing order; note that
 153     # the number of times x appears in b is len(b2j[x]) ...
 154     # when self.isjunk is defined, junk elements don't show up in this
 155     # map at all, which stops the central find_longest_match method
 156     # from starting any matching block at a junk element ...
 157     # also creates the fast isbjunk function ...
 158     # note that this is only called when b changes; so for cross-product
 159     # kinds of matches, it's best to call set_seq2 once, then set_seq1
 160     # repeatedly
 161
 162     def __chain_b(self):
 163         # Because isjunk is a user-defined (not C) function, and we test
 164         # for junk a LOT, it's important to minimize the number of calls.
 165         # Before the tricks described here, __chain_b was by far the most
 166         # time-consuming routine in the whole module!  If anyone sees
 167         # Jim Roskind, thank him again for profile.py -- I never would
 168         # have guessed that.
 169         # The first trick is to build b2j ignoring the possibility
 170         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
 171         # out the junk later is much cheaper than building b2j "right"
 172         # from the start.
 173         b = self.b
 174         self.b2j = b2j = {}
 175         self.b2jhas = b2jhas = b2j.has_key
 176         for i in xrange(0, len(b)):
 177             elt = b[i]
 178             if b2jhas(elt):
 179                 b2j[elt].append(i)
 180             else:
 181                 b2j[elt] = [i]
 182
 183         # Now b2j.keys() contains elements uniquely, and especially when
 184         # the sequence is a string, that's usually a good deal smaller
 185         # than len(string).  The difference is the number of isjunk calls
 186         # saved.
 187         isjunk, junkdict = self.isjunk, {}
 188         if isjunk:
 189             for elt in b2j.keys():
 190                 if isjunk(elt):
 191                     junkdict[elt] = 1   # value irrelevant; it's a set
 192                     del b2j[elt]
 193
 194         # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
 195         # latter is much faster.  Note too that while there may be a
 196         # lot of junk in the sequence, the number of *unique* junk
 197         # elements is probably small.  So the memory burden of keeping
 198         # this dict alive is likely trivial compared to the size of b2j.
 199         self.isbjunk = junkdict.has_key
 200
 201     def find_longest_match(self, alo, ahi, blo, bhi):
 202         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
 203
 204         If isjunk is not defined:
 205
 206         Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
 207             alo <= i <= i+k <= ahi
 208             blo <= j <= j+k <= bhi
 209         and for all (i',j',k') meeting those conditions,
 210             k >= k'
 211             i <= i'
 212             and if i == i', j <= j'
 213         In other words, of all maximal matching blocks, returns one
 214         that starts earliest in a, and of all those maximal matching
 215         blocks that start earliest in a, returns the one that starts
 216         earliest in b.
 217
 218         If isjunk is defined, first the longest matching block is
 219         determined as above, but with the additional restriction that
 220         no junk element appears in the block.  Then that block is
 221         extended as far as possible by matching (only) junk elements on
 222         both sides.  So the resulting block never matches on junk except
 223         as identical junk happens to be adjacent to an "interesting"
 224         match.
 225
 226         If no blocks match, returns (alo, blo, 0).
 227         """
 228
 229         # CAUTION:  stripping common prefix or suffix would be incorrect.
 230         # E.g.,
 231         #    ab
 232         #    acab
 233         # Longest matching block is "ab", but if common prefix is
 234         # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
 235         # strip, so ends up claiming that ab is changed to acab by
 236         # inserting "ca" in the middle.  That's minimal but unintuitive:
 237         # "it's obvious" that someone inserted "ac" at the front.
 238         # Windiff ends up at the same place as diff, but by pairing up
 239         # the unique 'b's and then matching the first two 'a's.
 240
 241         # find longest junk-free match
 242         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
 243         besti, bestj, bestsize = alo, blo, 0
 244         for i in xrange(alo, ahi):
 245             # check for longest match starting at a[i]
 246             if i + bestsize >= ahi:
 247                 # we're too far right to get a new best
 248                 break
 249             # look at all instances of a[i] in b; note that because
 250             # b2j has no junk keys, the loop is skipped if a[i] is junk
 251             for j in b2j.get(a[i], []):
 252                 # a[i] matches b[j]
 253                 if j < blo:
 254                     continue
 255                 if j + bestsize >= bhi:
 256                     # we're too far right to get a new best, here or
 257                     # anywhere to the right
 258                     break
 259                 if a[i + bestsize] != b[j + bestsize]:
 260                     # can't be longer match; this test is not necessary
 261                     # for correctness, but is a huge win for efficiency
 262                     continue
 263                 # set k to length of match
 264                 k = 1   # a[i] == b[j] already known
 265                 while i + k < ahi and j + k < bhi and \
 266                       a[i+k] == b[j+k] and not isbjunk(b[j+k]):
 267                     k = k + 1
 268                 if k > bestsize:
 269                     besti, bestj, bestsize = i, j, k
 270                     if i + bestsize >= ahi:
 271                         # only time in my life I really wanted a
 272                         # labelled break <wink> -- we're done with
 273                         # both loops now
 274                         break
 275
 276         # Now that we have a wholly interesting match (albeit possibly
 277         # empty!), we may as well suck up the matching junk on each
 278         # side of it too.  Can't think of a good reason not to, and it
 279         # saves post-processing the (possibly considerable) expense of
 280         # figuring out what to do with it.  In the case of an empty
 281         # interesting match, this is clearly the right thing to do,
 282         # because no other kind of match is possible in the regions.
 283         while besti > alo and bestj > blo and \
 284               isbjunk(b[bestj-1]) and \
 285               a[besti-1] == b[bestj-1]:
 286             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 287         while besti+bestsize < ahi and bestj+bestsize < bhi and \
 288               isbjunk(b[bestj+bestsize]) and \
 289               a[besti+bestsize] == b[bestj+bestsize]:
 290             bestsize = bestsize + 1
 291
 292         if TRACE:
 293             print "get_matching_blocks", alo, ahi, blo, bhi
 294             print "    returns", besti, bestj, bestsize
 295         return besti, bestj, bestsize
 296
 297 #   A different implementation, using a binary doubling technique that
 298 #   does far fewer element compares (trades 'em for integer compares),
 299 #   and has n*lg n worst-case behavior.  Alas, the code is much harder
 300 #   to follow (the details are tricky!), and in most cases I've seen,
 301 #   it takes at least 50% longer than the "clever dumb" method above;
 302 #   probably due to creating layers of small dicts.
 303 #   NOTE:  this no longer matches the version above wrt junk; remains
 304 #   too unpromising to update it; someday, though ...
 305
 306 #    def find_longest_match(self, alo, ahi, blo, bhi):
 307 #        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
 308 #
 309 #        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
 310 #            alo <= i <= i+k <= ahi
 311 #            blo <= j <= j+k <= bhi
 312 #        and for all (i',j',k') meeting those conditions,
 313 #            k >= k'
 314 #            i <= i'
 315 #            and if i == i', j <= j'
 316 #        In other words, of all maximal matching blocks, returns one
 317 #        that starts earliest in a, and of all those maximal matching
 318 #        blocks that start earliest in a, returns the one that starts
 319 #        earliest in b.
 320 #
 321 #        If no blocks match, returns (alo, blo, 0).
 322 #        """
 323 #
 324 #        a, b2j = self.a, self.b2j
 325 #        # alljs[size][i] is a set of all j's s.t. a[i:i+len] matches
 326 #        # b[j:j+len]
 327 #        alljs = {}
 328 #        alljs[1] = js = {}
 329 #        ahits = {}
 330 #        for i in xrange(alo, ahi):
 331 #            elt = a[i]
 332 #            if ahits.has_key(elt):
 333 #                js[i] = ahits[elt]
 334 #                continue
 335 #            if b2j.has_key(elt):
 336 #                in_range = {}
 337 #                for j in b2j[elt]:
 338 #                    if j >= blo:
 339 #                        if j >= bhi:
 340 #                            break
 341 #                        in_range[j] = 1
 342 #                if in_range:
 343 #                    ahits[elt] = js[i] = in_range
 344 #        del ahits
 345 #        size = 1
 346 #        while js:
 347 #            oldsize = size
 348 #            size = size + size
 349 #            oldjs = js
 350 #            alljs[size] = js = {}
 351 #            for i in oldjs.keys():
 352 #                # i has matches of size oldsize
 353 #                if not oldjs.has_key(i + oldsize):
 354 #                    # can't double it
 355 #                    continue
 356 #                second_js = oldjs[i + oldsize]
 357 #                answer = {}
 358 #                for j in oldjs[i].keys():
 359 #                    if second_js.has_key(j + oldsize):
 360 #                        answer[j] = 1
 361 #                if answer:
 362 #                    js[i] = answer
 363 #        del alljs[size]
 364 #        size = size >> 1    # max power of 2 with a match
 365 #        if not size:
 366 #            return alo, blo, 0
 367 #        besti, bestj, bestsize = alo, blo, 0
 368 #        fatis = alljs[size].keys()
 369 #        fatis.sort()
 370 #        for i in fatis:
 371 #            # figure out longest match starting at a[i]
 372 #            totalsize = halfsize = size
 373 #            # i has matches of len totalsize at the indices in js
 374 #            js = alljs[size][i].keys()
 375 #            while halfsize > 1:
 376 #                halfsize = halfsize >> 1
 377 #                # is there a match of len halfsize starting at
 378 #                # i + totalsize?
 379 #                newjs = []
 380 #                if alljs[halfsize].has_key(i + totalsize):
 381 #                    second_js = alljs[halfsize][i + totalsize]
 382 #                    for j in js:
 383 #                        if second_js.has_key(j + totalsize):
 384 #                            newjs.append(j)
 385 #                if newjs:
 386 #                    totalsize = totalsize + halfsize
 387 #                    js = newjs
 388 #            if totalsize > bestsize:
 389 #                besti, bestj, bestsize = i, min(js), totalsize
 390 #        return besti, bestj, bestsize
 391
 392     def get_matching_blocks(self):
 393         if self.matching_blocks is not None:
 394             return self.matching_blocks
 395         self.matching_blocks = []
 396         la, lb = len(self.a), len(self.b)
 397         self.__helper(0, la, 0, lb, self.matching_blocks)
 398         self.matching_blocks.append( (la, lb, 0) )
 399         if TRACE:
 400             print '*** matching blocks', self.matching_blocks
 401         return self.matching_blocks
 402
 403     # builds list of matching blocks covering a[alo:ahi] and
 404     # b[blo:bhi], appending them in increasing order to answer
 405
 406     def __helper(self, alo, ahi, blo, bhi, answer):
 407         i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
 408         # a[alo:i] vs b[blo:j] unknown
 409         # a[i:i+k] same as b[j:j+k]
 410         # a[i+k:ahi] vs b[j+k:bhi] unknown
 411         if k:
 412             if alo < i and blo < j:
 413                 self.__helper(alo, i, blo, j, answer)
 414             answer.append( x )
 415             if i+k < ahi and j+k < bhi:
 416                 self.__helper(i+k, ahi, j+k, bhi, answer)
 417
 418     def ratio(self):
 419         """Return a measure of the sequences' similarity (float in [0,1]).
 420
 421         Where T is the total number of elements in both sequences, and
 422         M is the number of matches, this is 2*M / T.
 423         Note that this is 1 if the sequences are identical, and 0 if
 424         they have nothing in common.
 425         """
 426
 427         matches = reduce(lambda sum, triple: sum + triple[-1],
 428                          self.get_matching_blocks(), 0)
 429         return 2.0 * matches / (len(self.a) + len(self.b))
 430
 431     def quick_ratio(self):
 432         """Return an upper bound on ratio() relatively quickly."""
 433         # viewing a and b as multisets, set matches to the cardinality
 434         # of their intersection; this counts the number of matches
 435         # without regard to order, so is clearly an upper bound
 436         if self.fullbcount is None:
 437             self.fullbcount = fullbcount = {}
 438             for elt in self.b:
 439                 fullbcount[elt] = fullbcount.get(elt, 0) + 1
 440         fullbcount = self.fullbcount
 441         # avail[x] is the number of times x appears in 'b' less the
 442         # number of times we've seen it in 'a' so far ... kinda
 443         avail = {}
 444         availhas, matches = avail.has_key, 0
 445         for elt in self.a:
 446             if availhas(elt):
 447                 numb = avail[elt]
 448             else:
 449                 numb = fullbcount.get(elt, 0)
 450             avail[elt] = numb - 1
 451             if numb > 0:
 452                 matches = matches + 1
 453         return 2.0 * matches / (len(self.a) + len(self.b))
 454
 455     def real_quick_ratio(self):
 456         """Return an upper bound on ratio() very quickly"""
 457         la, lb = len(self.a), len(self.b)
 458         # can't have more matches than the number of elements in the
 459         # shorter sequence
 460         return 2.0 * min(la, lb) / (la + lb)
 461
 462     def get_opcodes(self):
 463         if self.opcodes is not None:
 464             return self.opcodes
 465         i = j = 0
 466         self.opcodes = answer = []
 467         for ai, bj, size in self.get_matching_blocks():
 468             # invariant:  we've pumped out correct diffs to change
 469             # a[:i] into b[:j], and the next matching block is
 470             # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
 471             # out a diff to change a[i:ai] into b[j:bj], pump out
 472             # the matching block, and move (i,j) beyond the match
 473             tag = ''
 474             if i < ai and j < bj:
 475                 tag = 'replace'
 476             elif i < ai:
 477                 tag = 'delete'
 478             elif j < bj:
 479                 tag = 'insert'
 480             if tag:
 481                 answer.append( (tag, i, ai, j, bj) )
 482             i, j = ai+size, bj+size
 483             # the list of matching blocks is terminated by a
 484             # sentinel with size 0
 485             if size:
 486                 answer.append( ('equal', ai, i, bj, j) )
 487         return answer
 488
 489 # meant for dumping lines
 490 def dump(tag, x, lo, hi):
 491     for i in xrange(lo, hi):
 492         print tag, x[i],
 493
 494 # figure out which mark to stick under characters in lines that
 495 # have changed (blank = same, - = deleted, + = inserted, ^ = replaced)
 496 _combine = { '  ': ' ',
 497              '. ': '-',
 498              ' .': '+',
 499              '..': '^' }
 500
 501 def plain_replace(a, alo, ahi, b, blo, bhi):
 502     assert alo < ahi and blo < bhi
 503     # dump the shorter block first -- reduces the burden on short-term
 504     # memory if the blocks are of very different sizes
 505     if bhi - blo < ahi - alo:
 506         dump('+', b, blo, bhi)
 507         dump('-', a, alo, ahi)
 508     else:
 509         dump('-', a, alo, ahi)
 510         dump('+', b, blo, bhi)
 511
 512 # When replacing one block of lines with another, this guy searches
 513 # the blocks for *similar* lines; the best-matching pair (if any) is
 514 # used as a synch point, and intraline difference marking is done on
 515 # the similar pair.  Lots of work, but often worth it.
 516
 517 def fancy_replace(a, alo, ahi, b, blo, bhi):
 518     if TRACE:
 519         print '*** fancy_replace', alo, ahi, blo, bhi
 520         dump('>', a, alo, ahi)
 521         dump('<', b, blo, bhi)
 522
 523     # don't synch up unless the lines have a similarity score of at
 524     # least cutoff; best_ratio tracks the best score seen so far
 525     best_ratio, cutoff = 0.74, 0.75
 526     cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
 527     eqi, eqj = None, None   # 1st indices of equal lines (if any)
 528
 529     # search for the pair that matches best without being identical
 530     # (identical lines must be junk lines, & we don't want to synch up
 531     # on junk -- unless we have to)
 532     for j in xrange(blo, bhi):
 533         bj = b[j]
 534         cruncher.set_seq2(bj)
 535         for i in xrange(alo, ahi):
 536             ai = a[i]
 537             if ai == bj:
 538                 if eqi is None:
 539                     eqi, eqj = i, j
 540                 continue
 541             cruncher.set_seq1(ai)
 542             # computing similarity is expensive, so use the quick
 543             # upper bounds first -- have seen this speed up messy
 544             # compares by a factor of 3.
 545             # note that ratio() is only expensive to compute the first
 546             # time it's called on a sequence pair; the expensive part
 547             # of the computation is cached by cruncher
 548             if cruncher.real_quick_ratio() > best_ratio and \
 549                   cruncher.quick_ratio() > best_ratio and \
 550                   cruncher.ratio() > best_ratio:
 551                 best_ratio, best_i, best_j = cruncher.ratio(), i, j
 552     if best_ratio < cutoff:
 553         # no non-identical "pretty close" pair
 554         if eqi is None:
 555             # no identical pair either -- treat it as a straight replace
 556             plain_replace(a, alo, ahi, b, blo, bhi)
 557             return
 558         # no close pair, but an identical pair -- synch up on that
 559         best_i, best_j, best_ratio = eqi, eqj, 1.0
 560     else:
 561         # there's a close pair, so forget the identical pair (if any)
 562         eqi = None
 563
 564     # a[best_i] very similar to b[best_j]; eqi is None iff they're not
 565     # identical
 566     if TRACE:
 567         print '*** best_ratio', best_ratio, best_i, best_j
 568         dump('>', a, best_i, best_i+1)
 569         dump('<', b, best_j, best_j+1)
 570
 571     # pump out diffs from before the synch point
 572     fancy_helper(a, alo, best_i, b, blo, best_j)
 573
 574     # do intraline marking on the synch pair
 575     aelt, belt = a[best_i], b[best_j]
 576     if eqi is None:
 577         # pump out a '-', '+', '?' triple for the synched lines;
 578         atags = btags = ""
 579         cruncher.set_seqs(aelt, belt)
 580         for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
 581             la, lb = ai2 - ai1, bj2 - bj1
 582             if tag == 'replace':
 583                 atags = atags + '.' * la
 584                 btags = btags + '.' * lb
 585             elif tag == 'delete':
 586                 atags = atags + '.' * la
 587             elif tag == 'insert':
 588                 btags = btags + '.' * lb
 589             elif tag == 'equal':
 590                 atags = atags + ' ' * la
 591                 btags = btags + ' ' * lb
 592             else:
 593                 raise ValueError, 'unknown tag ' + `tag`
 594         la, lb = len(atags), len(btags)
 595         if la < lb:
 596             atags = atags + ' ' * (lb - la)
 597         elif lb < la:
 598             btags = btags + ' ' * (la - lb)
 599         combined = map(lambda x,y: _combine[x+y], atags, btags)
 600         print '-', aelt, '+', belt, '?', \
 601               string.rstrip(string.join(combined, ''))
 602     else:
 603         # the synch pair is identical
 604         print ' ', aelt,
 605
 606     # pump out diffs from after the synch point
 607     fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
 608
 609 def fancy_helper(a, alo, ahi, b, blo, bhi):
 610     if alo < ahi:
 611         if blo < bhi:
 612             fancy_replace(a, alo, ahi, b, blo, bhi)
 613         else:
 614             dump('-', a, alo, ahi)
 615     elif blo < bhi:
 616         dump('+', b, blo, bhi)
 617
 618 # open a file & return the file object; gripe and return 0 if it
 619 # couldn't be opened
 620 def fopen(fname):
 621     try:
 622         return open(fname, 'r')
 623     except IOError, detail:
 624         print "couldn't open " + fname + ": " + `detail`
 625         return 0
 626
 627 # open two files & spray the diff to stdout; return false iff a problem
 628 def fcompare(f1name, f2name):
 629     f1 = fopen(f1name)
 630     f2 = fopen(f2name)
 631     if not f1 or not f2:
 632         return 0
 633
 634     a = f1.readlines(); f1.close()
 635     b = f2.readlines(); f2.close()
 636
 637     cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
 638     for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
 639         if tag == 'replace':
 640             fancy_replace(a, alo, ahi, b, blo, bhi)
 641         elif tag == 'delete':
 642             dump('-', a, alo, ahi)
 643         elif tag == 'insert':
 644             dump('+', b, blo, bhi)
 645         elif tag == 'equal':
 646             dump(' ', a, alo, ahi)
 647         else:
 648             raise ValueError, 'unknown tag ' + `tag`
 649
 650     return 1
 651
 652 # get file names from argv & compare; return false iff a problem
 653 def main():
 654     from sys import argv
 655     if len(argv) != 3:
 656         print 'need 2 args'
 657         return 0
 658     [f1name, f2name] = argv[1:3]
 659     print '-:', f1name
 660     print '+:', f2name
 661     return fcompare(f1name, f2name)
 662
 663 if __name__ == '__main__':
 664     if 1:
 665         main()
 666     else:
 667         import profile, pstats
 668         statf = "ndiff.pro"
 669         profile.run("main()", statf)
 670         stats = pstats.Stats(statf)
 671         stats.strip_dirs().sort_stats('time').print_stats()
 672