Lib/difflib.py

   1 #! /usr/bin/env python
   2
   3 from __future__ import generators
   4
   5 """
   6 Module difflib -- helpers for computing deltas between objects.
   7
   8 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
   9     Use SequenceMatcher to return list of the best "good enough" matches.
  10
  11 Function ndiff(a, b):
  12     Return a delta: the difference between `a` and `b` (lists of strings).
  13
  14 Function restore(delta, which):
  15     Return one of the two sequences that generated an ndiff delta.
  16
  17 Class SequenceMatcher:
  18     A flexible class for comparing pairs of sequences of any type.
  19
  20 Class Differ:
  21     For producing human-readable deltas from sequences of lines of text.
  22 """
  23
  24 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
  25            'Differ']
  26
  27 class SequenceMatcher:
  28
  29     """
  30     SequenceMatcher is a flexible class for comparing pairs of sequences of
  31     any type, so long as the sequence elements are hashable.  The basic
  32     algorithm predates, and is a little fancier than, an algorithm
  33     published in the late 1980's by Ratcliff and Obershelp under the
  34     hyperbolic name "gestalt pattern matching".  The basic idea is to find
  35     the longest contiguous matching subsequence that contains no "junk"
  36     elements (R-O doesn't address junk).  The same idea is then applied
  37     recursively to the pieces of the sequences to the left and to the right
  38     of the matching subsequence.  This does not yield minimal edit
  39     sequences, but does tend to yield matches that "look right" to people.
  40
  41     SequenceMatcher tries to compute a "human-friendly diff" between two
  42     sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
  43     longest *contiguous* & junk-free matching subsequence.  That's what
  44     catches peoples' eyes.  The Windows(tm) windiff has another interesting
  45     notion, pairing up elements that appear uniquely in each sequence.
  46     That, and the method here, appear to yield more intuitive difference
  47     reports than does diff.  This method appears to be the least vulnerable
  48     to synching up on blocks of "junk lines", though (like blank lines in
  49     ordinary text files, or maybe "<P>" lines in HTML files).  That may be
  50     because this is the only method of the 3 that has a *concept* of
  51     "junk" <wink>.
  52
  53     Example, comparing two strings, and considering blanks to be "junk":
  54
  55     >>> s = SequenceMatcher(lambda x: x == " ",
  56     ...                     "private Thread currentThread;",
  57     ...                     "private volatile Thread currentThread;")
  58     >>>
  59
  60     .ratio() returns a float in [0, 1], measuring the "similarity" of the
  61     sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
  62     sequences are close matches:
  63
  64     >>> print round(s.ratio(), 3)
  65     0.866
  66     >>>
  67
  68     If you're only interested in where the sequences match,
  69     .get_matching_blocks() is handy:
  70
  71     >>> for block in s.get_matching_blocks():
  72     ...     print "a[%d] and b[%d] match for %d elements" % block
  73     a[0] and b[0] match for 8 elements
  74     a[8] and b[17] match for 6 elements
  75     a[14] and b[23] match for 15 elements
  76     a[29] and b[38] match for 0 elements
  77
  78     Note that the last tuple returned by .get_matching_blocks() is always a
  79     dummy, (len(a), len(b), 0), and this is the only case in which the last
  80     tuple element (number of elements matched) is 0.
  81
  82     If you want to know how to change the first sequence into the second,
  83     use .get_opcodes():
  84
  85     >>> for opcode in s.get_opcodes():
  86     ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
  87      equal a[0:8] b[0:8]
  88     insert a[8:8] b[8:17]
  89      equal a[8:14] b[17:23]
  90      equal a[14:29] b[23:38]
  91
  92     See the Differ class for a fancy human-friendly file differencer, which
  93     uses SequenceMatcher both to compare sequences of lines, and to compare
  94     sequences of characters within similar (near-matching) lines.
  95
  96     See also function get_close_matches() in this module, which shows how
  97     simple code building on SequenceMatcher can be used to do useful work.
  98
  99     Timing:  Basic R-O is cubic time worst case and quadratic time expected
 100     case.  SequenceMatcher is quadratic time for the worst case and has
 101     expected-case behavior dependent in a complicated way on how many
 102     elements the sequences have in common; best case time is linear.
 103
 104     Methods:
 105
 106     __init__(isjunk=None, a='', b='')
 107         Construct a SequenceMatcher.
 108
 109     set_seqs(a, b)
 110         Set the two sequences to be compared.
 111
 112     set_seq1(a)
 113         Set the first sequence to be compared.
 114
 115     set_seq2(b)
 116         Set the second sequence to be compared.
 117
 118     find_longest_match(alo, ahi, blo, bhi)
 119         Find longest matching block in a[alo:ahi] and b[blo:bhi].
 120
 121     get_matching_blocks()
 122         Return list of triples describing matching subsequences.
 123
 124     get_opcodes()
 125         Return list of 5-tuples describing how to turn a into b.
 126
 127     ratio()
 128         Return a measure of the sequences' similarity (float in [0,1]).
 129
 130     quick_ratio()
 131         Return an upper bound on .ratio() relatively quickly.
 132
 133     real_quick_ratio()
 134         Return an upper bound on ratio() very quickly.
 135     """
 136
 137     def __init__(self, isjunk=None, a='', b=''):
 138         """Construct a SequenceMatcher.
 139
 140         Optional arg isjunk is None (the default), or a one-argument
 141         function that takes a sequence element and returns true iff the
 142         element is junk.  None is equivalent to passing "lambda x: 0", i.e.
 143         no elements are considered to be junk.  For example, pass
 144             lambda x: x in " \\t"
 145         if you're comparing lines as sequences of characters, and don't
 146         want to synch up on blanks or hard tabs.
 147
 148         Optional arg a is the first of two sequences to be compared.  By
 149         default, an empty string.  The elements of a must be hashable.  See
 150         also .set_seqs() and .set_seq1().
 151
 152         Optional arg b is the second of two sequences to be compared.  By
 153         default, an empty string.  The elements of b must be hashable. See
 154         also .set_seqs() and .set_seq2().
 155         """
 156
 157         # Members:
 158         # a
 159         #      first sequence
 160         # b
 161         #      second sequence; differences are computed as "what do
 162         #      we need to do to 'a' to change it into 'b'?"
 163         # b2j
 164         #      for x in b, b2j[x] is a list of the indices (into b)
 165         #      at which x appears; junk elements do not appear
 166         # b2jhas
 167         #      b2j.has_key
 168         # fullbcount
 169         #      for x in b, fullbcount[x] == the number of times x
 170         #      appears in b; only materialized if really needed (used
 171         #      only for computing quick_ratio())
 172         # matching_blocks
 173         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
 174         #      ascending & non-overlapping in i and in j; terminated by
 175         #      a dummy (len(a), len(b), 0) sentinel
 176         # opcodes
 177         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
 178         #      one of
 179         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
 180         #          'delete'    a[i1:i2] should be deleted
 181         #          'insert'    b[j1:j2] should be inserted
 182         #          'equal'     a[i1:i2] == b[j1:j2]
 183         # isjunk
 184         #      a user-supplied function taking a sequence element and
 185         #      returning true iff the element is "junk" -- this has
 186         #      subtle but helpful effects on the algorithm, which I'll
 187         #      get around to writing up someday <0.9 wink>.
 188         #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
 189         # isbjunk
 190         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
 191         #      it's really the has_key method of a hidden dict.
 192         #      DOES NOT WORK for x in a!
 193
 194         self.isjunk = isjunk
 195         self.a = self.b = None
 196         self.set_seqs(a, b)
 197
 198     def set_seqs(self, a, b):
 199         """Set the two sequences to be compared.
 200
 201         >>> s = SequenceMatcher()
 202         >>> s.set_seqs("abcd", "bcde")
 203         >>> s.ratio()
 204         0.75
 205         """
 206
 207         self.set_seq1(a)
 208         self.set_seq2(b)
 209
 210     def set_seq1(self, a):
 211         """Set the first sequence to be compared.
 212
 213         The second sequence to be compared is not changed.
 214
 215         >>> s = SequenceMatcher(None, "abcd", "bcde")
 216         >>> s.ratio()
 217         0.75
 218         >>> s.set_seq1("bcde")
 219         >>> s.ratio()
 220         1.0
 221         >>>
 222
 223         SequenceMatcher computes and caches detailed information about the
 224         second sequence, so if you want to compare one sequence S against
 225         many sequences, use .set_seq2(S) once and call .set_seq1(x)
 226         repeatedly for each of the other sequences.
 227
 228         See also set_seqs() and set_seq2().
 229         """
 230
 231         if a is self.a:
 232             return
 233         self.a = a
 234         self.matching_blocks = self.opcodes = None
 235
 236     def set_seq2(self, b):
 237         """Set the second sequence to be compared.
 238
 239         The first sequence to be compared is not changed.
 240
 241         >>> s = SequenceMatcher(None, "abcd", "bcde")
 242         >>> s.ratio()
 243         0.75
 244         >>> s.set_seq2("abcd")
 245         >>> s.ratio()
 246         1.0
 247         >>>
 248
 249         SequenceMatcher computes and caches detailed information about the
 250         second sequence, so if you want to compare one sequence S against
 251         many sequences, use .set_seq2(S) once and call .set_seq1(x)
 252         repeatedly for each of the other sequences.
 253
 254         See also set_seqs() and set_seq1().
 255         """
 256
 257         if b is self.b:
 258             return
 259         self.b = b
 260         self.matching_blocks = self.opcodes = None
 261         self.fullbcount = None
 262         self.__chain_b()
 263
 264     # For each element x in b, set b2j[x] to a list of the indices in
 265     # b where x appears; the indices are in increasing order; note that
 266     # the number of times x appears in b is len(b2j[x]) ...
 267     # when self.isjunk is defined, junk elements don't show up in this
 268     # map at all, which stops the central find_longest_match method
 269     # from starting any matching block at a junk element ...
 270     # also creates the fast isbjunk function ...
 271     # note that this is only called when b changes; so for cross-product
 272     # kinds of matches, it's best to call set_seq2 once, then set_seq1
 273     # repeatedly
 274
 275     def __chain_b(self):
 276         # Because isjunk is a user-defined (not C) function, and we test
 277         # for junk a LOT, it's important to minimize the number of calls.
 278         # Before the tricks described here, __chain_b was by far the most
 279         # time-consuming routine in the whole module!  If anyone sees
 280         # Jim Roskind, thank him again for profile.py -- I never would
 281         # have guessed that.
 282         # The first trick is to build b2j ignoring the possibility
 283         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
 284         # out the junk later is much cheaper than building b2j "right"
 285         # from the start.
 286         b = self.b
 287         self.b2j = b2j = {}
 288         self.b2jhas = b2jhas = b2j.has_key
 289         for i in xrange(len(b)):
 290             elt = b[i]
 291             if b2jhas(elt):
 292                 b2j[elt].append(i)
 293             else:
 294                 b2j[elt] = [i]
 295
 296         # Now b2j.keys() contains elements uniquely, and especially when
 297         # the sequence is a string, that's usually a good deal smaller
 298         # than len(string).  The difference is the number of isjunk calls
 299         # saved.
 300         isjunk, junkdict = self.isjunk, {}
 301         if isjunk:
 302             for elt in b2j.keys():
 303                 if isjunk(elt):
 304                     junkdict[elt] = 1   # value irrelevant; it's a set
 305                     del b2j[elt]
 306
 307         # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
 308         # latter is much faster.  Note too that while there may be a
 309         # lot of junk in the sequence, the number of *unique* junk
 310         # elements is probably small.  So the memory burden of keeping
 311         # this dict alive is likely trivial compared to the size of b2j.
 312         self.isbjunk = junkdict.has_key
 313
 314     def find_longest_match(self, alo, ahi, blo, bhi):
 315         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
 316
 317         If isjunk is not defined:
 318
 319         Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
 320             alo <= i <= i+k <= ahi
 321             blo <= j <= j+k <= bhi
 322         and for all (i',j',k') meeting those conditions,
 323             k >= k'
 324             i <= i'
 325             and if i == i', j <= j'
 326
 327         In other words, of all maximal matching blocks, return one that
 328         starts earliest in a, and of all those maximal matching blocks that
 329         start earliest in a, return the one that starts earliest in b.
 330
 331         >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
 332         >>> s.find_longest_match(0, 5, 0, 9)
 333         (0, 4, 5)
 334
 335         If isjunk is defined, first the longest matching block is
 336         determined as above, but with the additional restriction that no
 337         junk element appears in the block.  Then that block is extended as
 338         far as possible by matching (only) junk elements on both sides.  So
 339         the resulting block never matches on junk except as identical junk
 340         happens to be adjacent to an "interesting" match.
 341
 342         Here's the same example as before, but considering blanks to be
 343         junk.  That prevents " abcd" from matching the " abcd" at the tail
 344         end of the second sequence directly.  Instead only the "abcd" can
 345         match, and matches the leftmost "abcd" in the second sequence:
 346
 347         >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
 348         >>> s.find_longest_match(0, 5, 0, 9)
 349         (1, 0, 4)
 350
 351         If no blocks match, return (alo, blo, 0).
 352
 353         >>> s = SequenceMatcher(None, "ab", "c")
 354         >>> s.find_longest_match(0, 2, 0, 1)
 355         (0, 0, 0)
 356         """
 357
 358         # CAUTION:  stripping common prefix or suffix would be incorrect.
 359         # E.g.,
 360         #    ab
 361         #    acab
 362         # Longest matching block is "ab", but if common prefix is
 363         # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
 364         # strip, so ends up claiming that ab is changed to acab by
 365         # inserting "ca" in the middle.  That's minimal but unintuitive:
 366         # "it's obvious" that someone inserted "ac" at the front.
 367         # Windiff ends up at the same place as diff, but by pairing up
 368         # the unique 'b's and then matching the first two 'a's.
 369
 370         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
 371         besti, bestj, bestsize = alo, blo, 0
 372         # find longest junk-free match
 373         # during an iteration of the loop, j2len[j] = length of longest
 374         # junk-free match ending with a[i-1] and b[j]
 375         j2len = {}
 376         nothing = []
 377         for i in xrange(alo, ahi):
 378             # look at all instances of a[i] in b; note that because
 379             # b2j has no junk keys, the loop is skipped if a[i] is junk
 380             j2lenget = j2len.get
 381             newj2len = {}
 382             for j in b2j.get(a[i], nothing):
 383                 # a[i] matches b[j]
 384                 if j < blo:
 385                     continue
 386                 if j >= bhi:
 387                     break
 388                 k = newj2len[j] = j2lenget(j-1, 0) + 1
 389                 if k > bestsize:
 390                     besti, bestj, bestsize = i-k+1, j-k+1, k
 391             j2len = newj2len
 392
 393         # Now that we have a wholly interesting match (albeit possibly
 394         # empty!), we may as well suck up the matching junk on each
 395         # side of it too.  Can't think of a good reason not to, and it
 396         # saves post-processing the (possibly considerable) expense of
 397         # figuring out what to do with it.  In the case of an empty
 398         # interesting match, this is clearly the right thing to do,
 399         # because no other kind of match is possible in the regions.
 400         while besti > alo and bestj > blo and \
 401               isbjunk(b[bestj-1]) and \
 402               a[besti-1] == b[bestj-1]:
 403             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 404         while besti+bestsize < ahi and bestj+bestsize < bhi and \
 405               isbjunk(b[bestj+bestsize]) and \
 406               a[besti+bestsize] == b[bestj+bestsize]:
 407             bestsize = bestsize + 1
 408
 409         return besti, bestj, bestsize
 410
 411     def get_matching_blocks(self):
 412         """Return list of triples describing matching subsequences.
 413
 414         Each triple is of the form (i, j, n), and means that
 415         a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
 416         i and in j.
 417
 418         The last triple is a dummy, (len(a), len(b), 0), and is the only
 419         triple with n==0.
 420
 421         >>> s = SequenceMatcher(None, "abxcd", "abcd")
 422         >>> s.get_matching_blocks()
 423         [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
 424         """
 425
 426         if self.matching_blocks is not None:
 427             return self.matching_blocks
 428         self.matching_blocks = []
 429         la, lb = len(self.a), len(self.b)
 430         self.__helper(0, la, 0, lb, self.matching_blocks)
 431         self.matching_blocks.append( (la, lb, 0) )
 432         return self.matching_blocks
 433
 434     # builds list of matching blocks covering a[alo:ahi] and
 435     # b[blo:bhi], appending them in increasing order to answer
 436
 437     def __helper(self, alo, ahi, blo, bhi, answer):
 438         i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
 439         # a[alo:i] vs b[blo:j] unknown
 440         # a[i:i+k] same as b[j:j+k]
 441         # a[i+k:ahi] vs b[j+k:bhi] unknown
 442         if k:
 443             if alo < i and blo < j:
 444                 self.__helper(alo, i, blo, j, answer)
 445             answer.append(x)
 446             if i+k < ahi and j+k < bhi:
 447                 self.__helper(i+k, ahi, j+k, bhi, answer)
 448
 449     def get_opcodes(self):
 450         """Return list of 5-tuples describing how to turn a into b.
 451
 452         Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
 453         has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
 454         tuple preceding it, and likewise for j1 == the previous j2.
 455
 456         The tags are strings, with these meanings:
 457
 458         'replace':  a[i1:i2] should be replaced by b[j1:j2]
 459         'delete':   a[i1:i2] should be deleted.
 460                     Note that j1==j2 in this case.
 461         'insert':   b[j1:j2] should be inserted at a[i1:i1].
 462                     Note that i1==i2 in this case.
 463         'equal':    a[i1:i2] == b[j1:j2]
 464
 465         >>> a = "qabxcd"
 466         >>> b = "abycdf"
 467         >>> s = SequenceMatcher(None, a, b)
 468         >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
 469         ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
 470         ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
 471          delete a[0:1] (q) b[0:0] ()
 472           equal a[1:3] (ab) b[0:2] (ab)
 473         replace a[3:4] (x) b[2:3] (y)
 474           equal a[4:6] (cd) b[3:5] (cd)
 475          insert a[6:6] () b[5:6] (f)
 476         """
 477
 478         if self.opcodes is not None:
 479             return self.opcodes
 480         i = j = 0
 481         self.opcodes = answer = []
 482         for ai, bj, size in self.get_matching_blocks():
 483             # invariant:  we've pumped out correct diffs to change
 484             # a[:i] into b[:j], and the next matching block is
 485             # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
 486             # out a diff to change a[i:ai] into b[j:bj], pump out
 487             # the matching block, and move (i,j) beyond the match
 488             tag = ''
 489             if i < ai and j < bj:
 490                 tag = 'replace'
 491             elif i < ai:
 492                 tag = 'delete'
 493             elif j < bj:
 494                 tag = 'insert'
 495             if tag:
 496                 answer.append( (tag, i, ai, j, bj) )
 497             i, j = ai+size, bj+size
 498             # the list of matching blocks is terminated by a
 499             # sentinel with size 0
 500             if size:
 501                 answer.append( ('equal', ai, i, bj, j) )
 502         return answer
 503
 504     def ratio(self):
 505         """Return a measure of the sequences' similarity (float in [0,1]).
 506
 507         Where T is the total number of elements in both sequences, and
 508         M is the number of matches, this is 2,0*M / T.
 509         Note that this is 1 if the sequences are identical, and 0 if
 510         they have nothing in common.
 511
 512         .ratio() is expensive to compute if you haven't already computed
 513         .get_matching_blocks() or .get_opcodes(), in which case you may
 514         want to try .quick_ratio() or .real_quick_ratio() first to get an
 515         upper bound.
 516
 517         >>> s = SequenceMatcher(None, "abcd", "bcde")
 518         >>> s.ratio()
 519         0.75
 520         >>> s.quick_ratio()
 521         0.75
 522         >>> s.real_quick_ratio()
 523         1.0
 524         """
 525
 526         matches = reduce(lambda sum, triple: sum + triple[-1],
 527                          self.get_matching_blocks(), 0)
 528         return 2.0 * matches / (len(self.a) + len(self.b))
 529
 530     def quick_ratio(self):
 531         """Return an upper bound on ratio() relatively quickly.
 532
 533         This isn't defined beyond that it is an upper bound on .ratio(), and
 534         is faster to compute.
 535         """
 536
 537         # viewing a and b as multisets, set matches to the cardinality
 538         # of their intersection; this counts the number of matches
 539         # without regard to order, so is clearly an upper bound
 540         if self.fullbcount is None:
 541             self.fullbcount = fullbcount = {}
 542             for elt in self.b:
 543                 fullbcount[elt] = fullbcount.get(elt, 0) + 1
 544         fullbcount = self.fullbcount
 545         # avail[x] is the number of times x appears in 'b' less the
 546         # number of times we've seen it in 'a' so far ... kinda
 547         avail = {}
 548         availhas, matches = avail.has_key, 0
 549         for elt in self.a:
 550             if availhas(elt):
 551                 numb = avail[elt]
 552             else:
 553                 numb = fullbcount.get(elt, 0)
 554             avail[elt] = numb - 1
 555             if numb > 0:
 556                 matches = matches + 1
 557         return 2.0 * matches / (len(self.a) + len(self.b))
 558
 559     def real_quick_ratio(self):
 560         """Return an upper bound on ratio() very quickly.
 561
 562         This isn't defined beyond that it is an upper bound on .ratio(), and
 563         is faster to compute than either .ratio() or .quick_ratio().
 564         """
 565
 566         la, lb = len(self.a), len(self.b)
 567         # can't have more matches than the number of elements in the
 568         # shorter sequence
 569         return 2.0 * min(la, lb) / (la + lb)
 570
 571 def get_close_matches(word, possibilities, n=3, cutoff=0.6):
 572     """Use SequenceMatcher to return list of the best "good enough" matches.
 573
 574     word is a sequence for which close matches are desired (typically a
 575     string).
 576
 577     possibilities is a list of sequences against which to match word
 578     (typically a list of strings).
 579
 580     Optional arg n (default 3) is the maximum number of close matches to
 581     return.  n must be > 0.
 582
 583     Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
 584     that don't score at least that similar to word are ignored.
 585
 586     The best (no more than n) matches among the possibilities are returned
 587     in a list, sorted by similarity score, most similar first.
 588
 589     >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
 590     ['apple', 'ape']
 591     >>> import keyword as _keyword
 592     >>> get_close_matches("wheel", _keyword.kwlist)
 593     ['while']
 594     >>> get_close_matches("apple", _keyword.kwlist)
 595     []
 596     >>> get_close_matches("accept", _keyword.kwlist)
 597     ['except']
 598     """
 599
 600     if not n >  0:
 601         raise ValueError("n must be > 0: " + `n`)
 602     if not 0.0 <= cutoff <= 1.0:
 603         raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
 604     result = []
 605     s = SequenceMatcher()
 606     s.set_seq2(word)
 607     for x in possibilities:
 608         s.set_seq1(x)
 609         if s.real_quick_ratio() >= cutoff and \
 610            s.quick_ratio() >= cutoff and \
 611            s.ratio() >= cutoff:
 612             result.append((s.ratio(), x))
 613     # Sort by score.
 614     result.sort()
 615     # Retain only the best n.
 616     result = result[-n:]
 617     # Move best-scorer to head of list.
 618     result.reverse()
 619     # Strip scores.
 620     return [x for score, x in result]
 621
 622
 623 def _count_leading(line, ch):
 624     """
 625     Return number of `ch` characters at the start of `line`.
 626
 627     Example:
 628
 629     >>> _count_leading('   abc', ' ')
 630     3
 631     """
 632
 633     i, n = 0, len(line)
 634     while i < n and line[i] == ch:
 635         i += 1
 636     return i
 637
 638 class Differ:
 639     r"""
 640     Differ is a class for comparing sequences of lines of text, and
 641     producing human-readable differences or deltas.  Differ uses
 642     SequenceMatcher both to compare sequences of lines, and to compare
 643     sequences of characters within similar (near-matching) lines.
 644
 645     Each line of a Differ delta begins with a two-letter code:
 646
 647         '- '    line unique to sequence 1
 648         '+ '    line unique to sequence 2
 649         '  '    line common to both sequences
 650         '? '    line not present in either input sequence
 651
 652     Lines beginning with '? ' attempt to guide the eye to intraline
 653     differences, and were not present in either input sequence.  These lines
 654     can be confusing if the sequences contain tab characters.
 655
 656     Note that Differ makes no claim to produce a *minimal* diff.  To the
 657     contrary, minimal diffs are often counter-intuitive, because they synch
 658     up anywhere possible, sometimes accidental matches 100 pages apart.
 659     Restricting synch points to contiguous matches preserves some notion of
 660     locality, at the occasional cost of producing a longer diff.
 661
 662     Example: Comparing two texts.
 663
 664     First we set up the texts, sequences of individual single-line strings
 665     ending with newlines (such sequences can also be obtained from the
 666     `readlines()` method of file-like objects):
 667
 668     >>> text1 = '''  1. Beautiful is better than ugly.
 669     ...   2. Explicit is better than implicit.
 670     ...   3. Simple is better than complex.
 671     ...   4. Complex is better than complicated.
 672     ... '''.splitlines(1)
 673     >>> len(text1)
 674     4
 675     >>> text1[0][-1]
 676     '\n'
 677     >>> text2 = '''  1. Beautiful is better than ugly.
 678     ...   3.   Simple is better than complex.
 679     ...   4. Complicated is better than complex.
 680     ...   5. Flat is better than nested.
 681     ... '''.splitlines(1)
 682
 683     Next we instantiate a Differ object:
 684
 685     >>> d = Differ()
 686
 687     Note that when instantiating a Differ object we may pass functions to
 688     filter out line and character 'junk'.  See Differ.__init__ for details.
 689
 690     Finally, we compare the two:
 691
 692     >>> result = list(d.compare(text1, text2))
 693
 694     'result' is a list of strings, so let's pretty-print it:
 695
 696     >>> from pprint import pprint as _pprint
 697     >>> _pprint(result)
 698     ['    1. Beautiful is better than ugly.\n',
 699      '-   2. Explicit is better than implicit.\n',
 700      '-   3. Simple is better than complex.\n',
 701      '+   3.   Simple is better than complex.\n',
 702      '?     ++\n',
 703      '-   4. Complex is better than complicated.\n',
 704      '?            ^                     ---- ^\n',
 705      '+   4. Complicated is better than complex.\n',
 706      '?           ++++ ^                      ^\n',
 707      '+   5. Flat is better than nested.\n']
 708
 709     As a single multi-line string it looks like this:
 710
 711     >>> print ''.join(result),
 712         1. Beautiful is better than ugly.
 713     -   2. Explicit is better than implicit.
 714     -   3. Simple is better than complex.
 715     +   3.   Simple is better than complex.
 716     ?     ++
 717     -   4. Complex is better than complicated.
 718     ?            ^                     ---- ^
 719     +   4. Complicated is better than complex.
 720     ?           ++++ ^                      ^
 721     +   5. Flat is better than nested.
 722
 723     Methods:
 724
 725     __init__(linejunk=None, charjunk=None)
 726         Construct a text differencer, with optional filters.
 727
 728     compare(a, b)
 729         Compare two sequences of lines; generate the resulting delta.
 730     """
 731
 732     def __init__(self, linejunk=None, charjunk=None):
 733         """
 734         Construct a text differencer, with optional filters.
 735
 736         The two optional keyword parameters are for filter functions:
 737
 738         - `linejunk`: A function that should accept a single string argument,
 739           and return true iff the string is junk. The module-level function
 740           `IS_LINE_JUNK` may be used to filter out lines without visible
 741           characters, except for at most one splat ('#').
 742
 743         - `charjunk`: A function that should accept a string of length 1. The
 744           module-level function `IS_CHARACTER_JUNK` may be used to filter out
 745           whitespace characters (a blank or tab; **note**: bad idea to include
 746           newline in this!).
 747         """
 748
 749         self.linejunk = linejunk
 750         self.charjunk = charjunk
 751
 752     def compare(self, a, b):
 753         r"""
 754         Compare two sequences of lines; generate the resulting delta.
 755
 756         Each sequence must contain individual single-line strings ending with
 757         newlines. Such sequences can be obtained from the `readlines()` method
 758         of file-like objects.  The delta generated also consists of newline-
 759         terminated strings, ready to be printed as-is via the writeline()
 760         method of a file-like object.
 761
 762         Example:
 763
 764         >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
 765         ...                                'ore\ntree\nemu\n'.splitlines(1))),
 766         - one
 767         ?  ^
 768         + ore
 769         ?  ^
 770         - two
 771         - three
 772         ?  -
 773         + tree
 774         + emu
 775         """
 776
 777         cruncher = SequenceMatcher(self.linejunk, a, b)
 778         for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
 779             if tag == 'replace':
 780                 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
 781             elif tag == 'delete':
 782                 g = self._dump('-', a, alo, ahi)
 783             elif tag == 'insert':
 784                 g = self._dump('+', b, blo, bhi)
 785             elif tag == 'equal':
 786                 g = self._dump(' ', a, alo, ahi)
 787             else:
 788                 raise ValueError, 'unknown tag ' + `tag`
 789
 790             for line in g:
 791                 yield line
 792
 793     def _dump(self, tag, x, lo, hi):
 794         """Generate comparison results for a same-tagged range."""
 795         for i in xrange(lo, hi):
 796             yield '%s %s' % (tag, x[i])
 797
 798     def _plain_replace(self, a, alo, ahi, b, blo, bhi):
 799         assert alo < ahi and blo < bhi
 800         # dump the shorter block first -- reduces the burden on short-term
 801         # memory if the blocks are of very different sizes
 802         if bhi - blo < ahi - alo:
 803             first  = self._dump('+', b, blo, bhi)
 804             second = self._dump('-', a, alo, ahi)
 805         else:
 806             first  = self._dump('-', a, alo, ahi)
 807             second = self._dump('+', b, blo, bhi)
 808
 809         for g in first, second:
 810             for line in g:
 811                 yield line
 812
 813     def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
 814         r"""
 815         When replacing one block of lines with another, search the blocks
 816         for *similar* lines; the best-matching pair (if any) is used as a
 817         synch point, and intraline difference marking is done on the
 818         similar pair. Lots of work, but often worth it.
 819
 820         Example:
 821
 822         >>> d = Differ()
 823         >>> d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ['abcdefGhijkl\n'], 0, 1)
 824         >>> print ''.join(d.results),
 825         - abcDefghiJkl
 826         ?    ^  ^  ^
 827         + abcdefGhijkl
 828         ?    ^  ^  ^
 829         """
 830
 831         # don't synch up unless the lines have a similarity score of at
 832         # least cutoff; best_ratio tracks the best score seen so far
 833         best_ratio, cutoff = 0.74, 0.75
 834         cruncher = SequenceMatcher(self.charjunk)
 835         eqi, eqj = None, None   # 1st indices of equal lines (if any)
 836
 837         # search for the pair that matches best without being identical
 838         # (identical lines must be junk lines, & we don't want to synch up
 839         # on junk -- unless we have to)
 840         for j in xrange(blo, bhi):
 841             bj = b[j]
 842             cruncher.set_seq2(bj)
 843             for i in xrange(alo, ahi):
 844                 ai = a[i]
 845                 if ai == bj:
 846                     if eqi is None:
 847                         eqi, eqj = i, j
 848                     continue
 849                 cruncher.set_seq1(ai)
 850                 # computing similarity is expensive, so use the quick
 851                 # upper bounds first -- have seen this speed up messy
 852                 # compares by a factor of 3.
 853                 # note that ratio() is only expensive to compute the first
 854                 # time it's called on a sequence pair; the expensive part
 855                 # of the computation is cached by cruncher
 856                 if cruncher.real_quick_ratio() > best_ratio and \
 857                       cruncher.quick_ratio() > best_ratio and \
 858                       cruncher.ratio() > best_ratio:
 859                     best_ratio, best_i, best_j = cruncher.ratio(), i, j
 860         if best_ratio < cutoff:
 861             # no non-identical "pretty close" pair
 862             if eqi is None:
 863                 # no identical pair either -- treat it as a straight replace
 864                 for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
 865                     yield line
 866                 return
 867             # no close pair, but an identical pair -- synch up on that
 868             best_i, best_j, best_ratio = eqi, eqj, 1.0
 869         else:
 870             # there's a close pair, so forget the identical pair (if any)
 871             eqi = None
 872
 873         # a[best_i] very similar to b[best_j]; eqi is None iff they're not
 874         # identical
 875
 876         # pump out diffs from before the synch point
 877         for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
 878             yield line
 879
 880         # do intraline marking on the synch pair
 881         aelt, belt = a[best_i], b[best_j]
 882         if eqi is None:
 883             # pump out a '-', '?', '+', '?' quad for the synched lines
 884             atags = btags = ""
 885             cruncher.set_seqs(aelt, belt)
 886             for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
 887                 la, lb = ai2 - ai1, bj2 - bj1
 888                 if tag == 'replace':
 889                     atags += '^' * la
 890                     btags += '^' * lb
 891                 elif tag == 'delete':
 892                     atags += '-' * la
 893                 elif tag == 'insert':
 894                     btags += '+' * lb
 895                 elif tag == 'equal':
 896                     atags += ' ' * la
 897                     btags += ' ' * lb
 898                 else:
 899                     raise ValueError, 'unknown tag ' + `tag`
 900             for line in self._qformat(aelt, belt, atags, btags):
 901                 yield line
 902         else:
 903             # the synch pair is identical
 904             yield '  ' + aelt
 905
 906         # pump out diffs from after the synch point
 907         for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
 908             yield line
 909
 910     def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
 911         g = []
 912         if alo < ahi:
 913             if blo < bhi:
 914                 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
 915             else:
 916                 g = self._dump('-', a, alo, ahi)
 917         elif blo < bhi:
 918             g = self._dump('+', b, blo, bhi)
 919
 920         for line in g:
 921             yield line
 922
 923     def _qformat(self, aline, bline, atags, btags):
 924         r"""
 925         Format "?" output and deal with leading tabs.
 926
 927         Example:
 928
 929         >>> d = Differ()
 930         >>> d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
 931         ...            '  ^ ^  ^      ', '+  ^ ^  ^      ')
 932         >>> for line in d.results: print repr(line)
 933         ...
 934         '- \tabcDefghiJkl\n'
 935         '? \t ^ ^  ^\n'
 936         '+ \t\tabcdefGhijkl\n'
 937         '? \t  ^ ^  ^\n'
 938         """
 939
 940         # Can hurt, but will probably help most of the time.
 941         common = min(_count_leading(aline, "\t"),
 942                      _count_leading(bline, "\t"))
 943         common = min(common, _count_leading(atags[:common], " "))
 944         atags = atags[common:].rstrip()
 945         btags = btags[common:].rstrip()
 946
 947         yield "- " + aline
 948         if atags:
 949             yield "? %s%s\n" % ("\t" * common, atags)
 950
 951         yield "+ " + bline
 952         if btags:
 953             yield "? %s%s\n" % ("\t" * common, btags)
 954
 955 # With respect to junk, an earlier version of ndiff simply refused to
 956 # *start* a match with a junk element.  The result was cases like this:
 957 #     before: private Thread currentThread;
 958 #     after:  private volatile Thread currentThread;
 959 # If you consider whitespace to be junk, the longest contiguous match
 960 # not starting with junk is "e Thread currentThread".  So ndiff reported
 961 # that "e volatil" was inserted between the 't' and the 'e' in "private".
 962 # While an accurate view, to people that's absurd.  The current version
 963 # looks for matching blocks that are entirely junk-free, then extends the
 964 # longest one of those as far as possible but only with matching junk.
 965 # So now "currentThread" is matched, then extended to suck up the
 966 # preceding blank; then "private" is matched, and extended to suck up the
 967 # following blank; then "Thread" is matched; and finally ndiff reports
 968 # that "volatile " was inserted before "Thread".  The only quibble
 969 # remaining is that perhaps it was really the case that " volatile"
 970 # was inserted after "private".  I can live with that <wink>.
 971
 972 import re
 973
 974 def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
 975     r"""
 976     Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
 977
 978     Examples:
 979
 980     >>> IS_LINE_JUNK('\n')
 981     1
 982     >>> IS_LINE_JUNK('  #   \n')
 983     1
 984     >>> IS_LINE_JUNK('hello\n')
 985     0
 986     """
 987
 988     return pat(line) is not None
 989
 990 def IS_CHARACTER_JUNK(ch, ws=" \t"):
 991     r"""
 992     Return 1 for ignorable character: iff `ch` is a space or tab.
 993
 994     Examples:
 995
 996     >>> IS_CHARACTER_JUNK(' ')
 997     1
 998     >>> IS_CHARACTER_JUNK('\t')
 999     1
1000     >>> IS_CHARACTER_JUNK('\n')
1001     0
1002     >>> IS_CHARACTER_JUNK('x')
1003     0
1004     """
1005
1006     return ch in ws
1007
1008 del re
1009
1010 def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
1011     r"""
1012     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
1013
1014     Optional keyword parameters `linejunk` and `charjunk` are for filter
1015     functions (or None):
1016
1017     - linejunk: A function that should accept a single string argument, and
1018       return true iff the string is junk. The default is module-level function
1019       IS_LINE_JUNK, which filters out lines without visible characters, except
1020       for at most one splat ('#').
1021
1022     - charjunk: A function that should accept a string of length 1. The
1023       default is module-level function IS_CHARACTER_JUNK, which filters out
1024       whitespace characters (a blank or tab; note: bad idea to include newline
1025       in this!).
1026
1027     Tools/scripts/ndiff.py is a command-line front-end to this function.
1028
1029     Example:
1030
1031     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
1032     ...              'ore\ntree\nemu\n'.splitlines(1))
1033     >>> print ''.join(diff),
1034     - one
1035     ?  ^
1036     + ore
1037     ?  ^
1038     - two
1039     - three
1040     ?  -
1041     + tree
1042     + emu
1043     """
1044     return Differ(linejunk, charjunk).compare(a, b)
1045
1046 def restore(delta, which):
1047     r"""
1048     Generate one of the two sequences that generated a delta.
1049
1050     Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
1051     lines originating from file 1 or 2 (parameter `which`), stripping off line
1052     prefixes.
1053
1054     Examples:
1055
1056     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
1057     ...              'ore\ntree\nemu\n'.splitlines(1))
1058     >>> diff = list(diff)
1059     >>> print ''.join(restore(diff, 1)),
1060     one
1061     two
1062     three
1063     >>> print ''.join(restore(diff, 2)),
1064     ore
1065     tree
1066     emu
1067     """
1068     try:
1069         tag = {1: "- ", 2: "+ "}[int(which)]
1070     except KeyError:
1071         raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
1072                            % which)
1073     prefixes = ("  ", tag)
1074     for line in delta:
1075         if line[:2] in prefixes:
1076             yield line[2:]
1077
1078 def _test():
1079     import doctest, difflib
1080     return doctest.testmod(difflib)
1081
1082 if __name__ == "__main__":
1083     _test()