Lib/difflib.py

   1 #! /usr/bin/env python
   2
   3 """
   4 Module difflib -- helpers for computing deltas between objects.
   5
   6 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
   7     Use SequenceMatcher to return list of the best "good enough" matches.
   8
   9 Function ndiff(a, b):
  10     Return a delta: the difference between `a` and `b` (lists of strings).
  11
  12 Function restore(delta, which):
  13     Return one of the two sequences that generated an ndiff delta.
  14
  15 Class SequenceMatcher:
  16     A flexible class for comparing pairs of sequences of any type.
  17
  18 Class Differ:
  19     For producing human-readable deltas from sequences of lines of text.
  20 """
  21
# Names exported by "from difflib import *"; anything not listed here
# (e.g. underscore-prefixed helpers) is treated as module-private.
__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ']
  24
  25 class SequenceMatcher:
  26
  27     """
  28     SequenceMatcher is a flexible class for comparing pairs of sequences of
  29     any type, so long as the sequence elements are hashable.  The basic
  30     algorithm predates, and is a little fancier than, an algorithm
  31     published in the late 1980's by Ratcliff and Obershelp under the
  32     hyperbolic name "gestalt pattern matching".  The basic idea is to find
  33     the longest contiguous matching subsequence that contains no "junk"
  34     elements (R-O doesn't address junk).  The same idea is then applied
  35     recursively to the pieces of the sequences to the left and to the right
  36     of the matching subsequence.  This does not yield minimal edit
  37     sequences, but does tend to yield matches that "look right" to people.
  38
  39     SequenceMatcher tries to compute a "human-friendly diff" between two
  40     sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
  41     longest *contiguous* & junk-free matching subsequence.  That's what
  42     catches peoples' eyes.  The Windows(tm) windiff has another interesting
  43     notion, pairing up elements that appear uniquely in each sequence.
  44     That, and the method here, appear to yield more intuitive difference
  45     reports than does diff.  This method appears to be the least vulnerable
  46     to synching up on blocks of "junk lines", though (like blank lines in
  47     ordinary text files, or maybe "<P>" lines in HTML files).  That may be
  48     because this is the only method of the 3 that has a *concept* of
  49     "junk" <wink>.
  50
  51     Example, comparing two strings, and considering blanks to be "junk":
  52
  53     >>> s = SequenceMatcher(lambda x: x == " ",
  54     ...                     "private Thread currentThread;",
  55     ...                     "private volatile Thread currentThread;")
  56     >>>
  57
  58     .ratio() returns a float in [0, 1], measuring the "similarity" of the
  59     sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
  60     sequences are close matches:
  61
  62     >>> print round(s.ratio(), 3)
  63     0.866
  64     >>>
  65
  66     If you're only interested in where the sequences match,
  67     .get_matching_blocks() is handy:
  68
  69     >>> for block in s.get_matching_blocks():
  70     ...     print "a[%d] and b[%d] match for %d elements" % block
  71     a[0] and b[0] match for 8 elements
  72     a[8] and b[17] match for 6 elements
  73     a[14] and b[23] match for 15 elements
  74     a[29] and b[38] match for 0 elements
  75
  76     Note that the last tuple returned by .get_matching_blocks() is always a
  77     dummy, (len(a), len(b), 0), and this is the only case in which the last
  78     tuple element (number of elements matched) is 0.
  79
  80     If you want to know how to change the first sequence into the second,
  81     use .get_opcodes():
  82
  83     >>> for opcode in s.get_opcodes():
  84     ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
  85      equal a[0:8] b[0:8]
  86     insert a[8:8] b[8:17]
  87      equal a[8:14] b[17:23]
  88      equal a[14:29] b[23:38]
  89
  90     See the Differ class for a fancy human-friendly file differencer, which
  91     uses SequenceMatcher both to compare sequences of lines, and to compare
  92     sequences of characters within similar (near-matching) lines.
  93
  94     See also function get_close_matches() in this module, which shows how
  95     simple code building on SequenceMatcher can be used to do useful work.
  96
  97     Timing:  Basic R-O is cubic time worst case and quadratic time expected
  98     case.  SequenceMatcher is quadratic time for the worst case and has
  99     expected-case behavior dependent in a complicated way on how many
 100     elements the sequences have in common; best case time is linear.
 101
 102     Methods:
 103
 104     __init__(isjunk=None, a='', b='')
 105         Construct a SequenceMatcher.
 106
 107     set_seqs(a, b)
 108         Set the two sequences to be compared.
 109
 110     set_seq1(a)
 111         Set the first sequence to be compared.
 112
 113     set_seq2(b)
 114         Set the second sequence to be compared.
 115
 116     find_longest_match(alo, ahi, blo, bhi)
 117         Find longest matching block in a[alo:ahi] and b[blo:bhi].
 118
 119     get_matching_blocks()
 120         Return list of triples describing matching subsequences.
 121
 122     get_opcodes()
 123         Return list of 5-tuples describing how to turn a into b.
 124
 125     ratio()
 126         Return a measure of the sequences' similarity (float in [0,1]).
 127
 128     quick_ratio()
 129         Return an upper bound on .ratio() relatively quickly.
 130
 131     real_quick_ratio()
 132         Return an upper bound on ratio() very quickly.
 133     """
 134
 135     def __init__(self, isjunk=None, a='', b=''):
 136         """Construct a SequenceMatcher.
 137
 138         Optional arg isjunk is None (the default), or a one-argument
 139         function that takes a sequence element and returns true iff the
 140         element is junk.  None is equivalent to passing "lambda x: 0", i.e.
 141         no elements are considered to be junk.  For example, pass
 142             lambda x: x in " \\t"
 143         if you're comparing lines as sequences of characters, and don't
 144         want to synch up on blanks or hard tabs.
 145
 146         Optional arg a is the first of two sequences to be compared.  By
 147         default, an empty string.  The elements of a must be hashable.  See
 148         also .set_seqs() and .set_seq1().
 149
 150         Optional arg b is the second of two sequences to be compared.  By
 151         default, an empty string.  The elements of b must be hashable. See
 152         also .set_seqs() and .set_seq2().
 153         """
 154
 155         # Members:
 156         # a
 157         #      first sequence
 158         # b
 159         #      second sequence; differences are computed as "what do
 160         #      we need to do to 'a' to change it into 'b'?"
 161         # b2j
 162         #      for x in b, b2j[x] is a list of the indices (into b)
 163         #      at which x appears; junk elements do not appear
 164         # fullbcount
 165         #      for x in b, fullbcount[x] == the number of times x
 166         #      appears in b; only materialized if really needed (used
 167         #      only for computing quick_ratio())
 168         # matching_blocks
 169         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
 170         #      ascending & non-overlapping in i and in j; terminated by
 171         #      a dummy (len(a), len(b), 0) sentinel
 172         # opcodes
 173         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
 174         #      one of
 175         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
 176         #          'delete'    a[i1:i2] should be deleted
 177         #          'insert'    b[j1:j2] should be inserted
 178         #          'equal'     a[i1:i2] == b[j1:j2]
 179         # isjunk
 180         #      a user-supplied function taking a sequence element and
 181         #      returning true iff the element is "junk" -- this has
 182         #      subtle but helpful effects on the algorithm, which I'll
 183         #      get around to writing up someday <0.9 wink>.
 184         #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
 185         # isbjunk
 186         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
 187         #      it's really the has_key method of a hidden dict.
 188         #      DOES NOT WORK for x in a!
 189         # isbpopular
 190         #      for x in b, isbpopular(x) is true iff b is reasonably long
 191         #      (at least 200 elements) and x accounts for more than 1% of
 192         #      its elements.  DOES NOT WORK for x in a!
 193
 194         self.isjunk = isjunk
 195         self.a = self.b = None
 196         self.set_seqs(a, b)
 197
 198     def set_seqs(self, a, b):
 199         """Set the two sequences to be compared.
 200
 201         >>> s = SequenceMatcher()
 202         >>> s.set_seqs("abcd", "bcde")
 203         >>> s.ratio()
 204         0.75
 205         """
 206
 207         self.set_seq1(a)
 208         self.set_seq2(b)
 209
 210     def set_seq1(self, a):
 211         """Set the first sequence to be compared.
 212
 213         The second sequence to be compared is not changed.
 214
 215         >>> s = SequenceMatcher(None, "abcd", "bcde")
 216         >>> s.ratio()
 217         0.75
 218         >>> s.set_seq1("bcde")
 219         >>> s.ratio()
 220         1.0
 221         >>>
 222
 223         SequenceMatcher computes and caches detailed information about the
 224         second sequence, so if you want to compare one sequence S against
 225         many sequences, use .set_seq2(S) once and call .set_seq1(x)
 226         repeatedly for each of the other sequences.
 227
 228         See also set_seqs() and set_seq2().
 229         """
 230
 231         if a is self.a:
 232             return
 233         self.a = a
 234         self.matching_blocks = self.opcodes = None
 235
 236     def set_seq2(self, b):
 237         """Set the second sequence to be compared.
 238
 239         The first sequence to be compared is not changed.
 240
 241         >>> s = SequenceMatcher(None, "abcd", "bcde")
 242         >>> s.ratio()
 243         0.75
 244         >>> s.set_seq2("abcd")
 245         >>> s.ratio()
 246         1.0
 247         >>>
 248
 249         SequenceMatcher computes and caches detailed information about the
 250         second sequence, so if you want to compare one sequence S against
 251         many sequences, use .set_seq2(S) once and call .set_seq1(x)
 252         repeatedly for each of the other sequences.
 253
 254         See also set_seqs() and set_seq1().
 255         """
 256
 257         if b is self.b:
 258             return
 259         self.b = b
 260         self.matching_blocks = self.opcodes = None
 261         self.fullbcount = None
 262         self.__chain_b()
 263
 264     # For each element x in b, set b2j[x] to a list of the indices in
 265     # b where x appears; the indices are in increasing order; note that
 266     # the number of times x appears in b is len(b2j[x]) ...
 267     # when self.isjunk is defined, junk elements don't show up in this
 268     # map at all, which stops the central find_longest_match method
 269     # from starting any matching block at a junk element ...
 270     # also creates the fast isbjunk function ...
 271     # b2j also does not contain entries for "popular" elements, meaning
 272     # elements that account for more than 1% of the total elements, and
 273     # when the sequence is reasonably large (>= 200 elements); this can
 274     # be viewed as an adaptive notion of semi-junk, and yields an enormous
 275     # speedup when, e.g., comparing program files with hundreds of
 276     # instances of "return NULL;" ...
 277     # note that this is only called when b changes; so for cross-product
 278     # kinds of matches, it's best to call set_seq2 once, then set_seq1
 279     # repeatedly
 280
 281     def __chain_b(self):
 282         # Because isjunk is a user-defined (not C) function, and we test
 283         # for junk a LOT, it's important to minimize the number of calls.
 284         # Before the tricks described here, __chain_b was by far the most
 285         # time-consuming routine in the whole module!  If anyone sees
 286         # Jim Roskind, thank him again for profile.py -- I never would
 287         # have guessed that.
 288         # The first trick is to build b2j ignoring the possibility
 289         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
 290         # out the junk later is much cheaper than building b2j "right"
 291         # from the start.
 292         b = self.b
 293         n = len(b)
 294         self.b2j = b2j = {}
 295         populardict = {}
 296         for i, elt in enumerate(b):
 297             if elt in b2j:
 298                 indices = b2j[elt]
 299                 if n >= 200 and len(indices) * 100 > n:
 300                     populardict[elt] = 1
 301                     del indices[:]
 302                 else:
 303                     indices.append(i)
 304             else:
 305                 b2j[elt] = [i]
 306
 307         # Purge leftover indices for popular elements.
 308         for elt in populardict:
 309             del b2j[elt]
 310
 311         # Now b2j.keys() contains elements uniquely, and especially when
 312         # the sequence is a string, that's usually a good deal smaller
 313         # than len(string).  The difference is the number of isjunk calls
 314         # saved.
 315         isjunk = self.isjunk
 316         junkdict = {}
 317         if isjunk:
 318             for d in populardict, b2j:
 319                 for elt in d.keys():
 320                     if isjunk(elt):
 321                         junkdict[elt] = 1
 322                         del d[elt]
 323
 324         # Now for x in b, isjunk(x) == x in junkdict, but the
 325         # latter is much faster.  Note too that while there may be a
 326         # lot of junk in the sequence, the number of *unique* junk
 327         # elements is probably small.  So the memory burden of keeping
 328         # this dict alive is likely trivial compared to the size of b2j.
 329         self.isbjunk = junkdict.has_key
 330         self.isbpopular = populardict.has_key
 331
 332     def find_longest_match(self, alo, ahi, blo, bhi):
 333         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
 334
 335         If isjunk is not defined:
 336
 337         Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
 338             alo <= i <= i+k <= ahi
 339             blo <= j <= j+k <= bhi
 340         and for all (i',j',k') meeting those conditions,
 341             k >= k'
 342             i <= i'
 343             and if i == i', j <= j'
 344
 345         In other words, of all maximal matching blocks, return one that
 346         starts earliest in a, and of all those maximal matching blocks that
 347         start earliest in a, return the one that starts earliest in b.
 348
 349         >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
 350         >>> s.find_longest_match(0, 5, 0, 9)
 351         (0, 4, 5)
 352
 353         If isjunk is defined, first the longest matching block is
 354         determined as above, but with the additional restriction that no
 355         junk element appears in the block.  Then that block is extended as
 356         far as possible by matching (only) junk elements on both sides.  So
 357         the resulting block never matches on junk except as identical junk
 358         happens to be adjacent to an "interesting" match.
 359
 360         Here's the same example as before, but considering blanks to be
 361         junk.  That prevents " abcd" from matching the " abcd" at the tail
 362         end of the second sequence directly.  Instead only the "abcd" can
 363         match, and matches the leftmost "abcd" in the second sequence:
 364
 365         >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
 366         >>> s.find_longest_match(0, 5, 0, 9)
 367         (1, 0, 4)
 368
 369         If no blocks match, return (alo, blo, 0).
 370
 371         >>> s = SequenceMatcher(None, "ab", "c")
 372         >>> s.find_longest_match(0, 2, 0, 1)
 373         (0, 0, 0)
 374         """
 375
 376         # CAUTION:  stripping common prefix or suffix would be incorrect.
 377         # E.g.,
 378         #    ab
 379         #    acab
 380         # Longest matching block is "ab", but if common prefix is
 381         # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
 382         # strip, so ends up claiming that ab is changed to acab by
 383         # inserting "ca" in the middle.  That's minimal but unintuitive:
 384         # "it's obvious" that someone inserted "ac" at the front.
 385         # Windiff ends up at the same place as diff, but by pairing up
 386         # the unique 'b's and then matching the first two 'a's.
 387
 388         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
 389         besti, bestj, bestsize = alo, blo, 0
 390         # find longest junk-free match
 391         # during an iteration of the loop, j2len[j] = length of longest
 392         # junk-free match ending with a[i-1] and b[j]
 393         j2len = {}
 394         nothing = []
 395         for i in xrange(alo, ahi):
 396             # look at all instances of a[i] in b; note that because
 397             # b2j has no junk keys, the loop is skipped if a[i] is junk
 398             j2lenget = j2len.get
 399             newj2len = {}
 400             for j in b2j.get(a[i], nothing):
 401                 # a[i] matches b[j]
 402                 if j < blo:
 403                     continue
 404                 if j >= bhi:
 405                     break
 406                 k = newj2len[j] = j2lenget(j-1, 0) + 1
 407                 if k > bestsize:
 408                     besti, bestj, bestsize = i-k+1, j-k+1, k
 409             j2len = newj2len
 410
 411         # Extend the best by non-junk elements on each end.  In particular,
 412         # "popular" non-junk elements aren't in b2j, which greatly speeds
 413         # the inner loop above, but also means "the best" match so far
 414         # doesn't contain any junk *or* popular non-junk elements.
 415         while besti > alo and bestj > blo and \
 416               not isbjunk(b[bestj-1]) and \
 417               a[besti-1] == b[bestj-1]:
 418             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 419         while besti+bestsize < ahi and bestj+bestsize < bhi and \
 420               not isbjunk(b[bestj+bestsize]) and \
 421               a[besti+bestsize] == b[bestj+bestsize]:
 422             bestsize += 1
 423
 424         # Now that we have a wholly interesting match (albeit possibly
 425         # empty!), we may as well suck up the matching junk on each
 426         # side of it too.  Can't think of a good reason not to, and it
 427         # saves post-processing the (possibly considerable) expense of
 428         # figuring out what to do with it.  In the case of an empty
 429         # interesting match, this is clearly the right thing to do,
 430         # because no other kind of match is possible in the regions.
 431         while besti > alo and bestj > blo and \
 432               isbjunk(b[bestj-1]) and \
 433               a[besti-1] == b[bestj-1]:
 434             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 435         while besti+bestsize < ahi and bestj+bestsize < bhi and \
 436               isbjunk(b[bestj+bestsize]) and \
 437               a[besti+bestsize] == b[bestj+bestsize]:
 438             bestsize = bestsize + 1
 439
 440         return besti, bestj, bestsize
 441
 442     def get_matching_blocks(self):
 443         """Return list of triples describing matching subsequences.
 444
 445         Each triple is of the form (i, j, n), and means that
 446         a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
 447         i and in j.
 448
 449         The last triple is a dummy, (len(a), len(b), 0), and is the only
 450         triple with n==0.
 451
 452         >>> s = SequenceMatcher(None, "abxcd", "abcd")
 453         >>> s.get_matching_blocks()
 454         [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
 455         """
 456
 457         if self.matching_blocks is not None:
 458             return self.matching_blocks
 459         self.matching_blocks = []
 460         la, lb = len(self.a), len(self.b)
 461         self.__helper(0, la, 0, lb, self.matching_blocks)
 462         self.matching_blocks.append( (la, lb, 0) )
 463         return self.matching_blocks
 464
 465     # builds list of matching blocks covering a[alo:ahi] and
 466     # b[blo:bhi], appending them in increasing order to answer
 467
 468     def __helper(self, alo, ahi, blo, bhi, answer):
 469         i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
 470         # a[alo:i] vs b[blo:j] unknown
 471         # a[i:i+k] same as b[j:j+k]
 472         # a[i+k:ahi] vs b[j+k:bhi] unknown
 473         if k:
 474             if alo < i and blo < j:
 475                 self.__helper(alo, i, blo, j, answer)
 476             answer.append(x)
 477             if i+k < ahi and j+k < bhi:
 478                 self.__helper(i+k, ahi, j+k, bhi, answer)
 479
 480     def get_opcodes(self):
 481         """Return list of 5-tuples describing how to turn a into b.
 482
 483         Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
 484         has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
 485         tuple preceding it, and likewise for j1 == the previous j2.
 486
 487         The tags are strings, with these meanings:
 488
 489         'replace':  a[i1:i2] should be replaced by b[j1:j2]
 490         'delete':   a[i1:i2] should be deleted.
 491                     Note that j1==j2 in this case.
 492         'insert':   b[j1:j2] should be inserted at a[i1:i1].
 493                     Note that i1==i2 in this case.
 494         'equal':    a[i1:i2] == b[j1:j2]
 495
 496         >>> a = "qabxcd"
 497         >>> b = "abycdf"
 498         >>> s = SequenceMatcher(None, a, b)
 499         >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
 500         ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
 501         ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
 502          delete a[0:1] (q) b[0:0] ()
 503           equal a[1:3] (ab) b[0:2] (ab)
 504         replace a[3:4] (x) b[2:3] (y)
 505           equal a[4:6] (cd) b[3:5] (cd)
 506          insert a[6:6] () b[5:6] (f)
 507         """
 508
 509         if self.opcodes is not None:
 510             return self.opcodes
 511         i = j = 0
 512         self.opcodes = answer = []
 513         for ai, bj, size in self.get_matching_blocks():
 514             # invariant:  we've pumped out correct diffs to change
 515             # a[:i] into b[:j], and the next matching block is
 516             # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
 517             # out a diff to change a[i:ai] into b[j:bj], pump out
 518             # the matching block, and move (i,j) beyond the match
 519             tag = ''
 520             if i < ai and j < bj:
 521                 tag = 'replace'
 522             elif i < ai:
 523                 tag = 'delete'
 524             elif j < bj:
 525                 tag = 'insert'
 526             if tag:
 527                 answer.append( (tag, i, ai, j, bj) )
 528             i, j = ai+size, bj+size
 529             # the list of matching blocks is terminated by a
 530             # sentinel with size 0
 531             if size:
 532                 answer.append( ('equal', ai, i, bj, j) )
 533         return answer
 534
 535     def ratio(self):
 536         """Return a measure of the sequences' similarity (float in [0,1]).
 537
 538         Where T is the total number of elements in both sequences, and
 539         M is the number of matches, this is 2,0*M / T.
 540         Note that this is 1 if the sequences are identical, and 0 if
 541         they have nothing in common.
 542
 543         .ratio() is expensive to compute if you haven't already computed
 544         .get_matching_blocks() or .get_opcodes(), in which case you may
 545         want to try .quick_ratio() or .real_quick_ratio() first to get an
 546         upper bound.
 547
 548         >>> s = SequenceMatcher(None, "abcd", "bcde")
 549         >>> s.ratio()
 550         0.75
 551         >>> s.quick_ratio()
 552         0.75
 553         >>> s.real_quick_ratio()
 554         1.0
 555         """
 556
 557         matches = reduce(lambda sum, triple: sum + triple[-1],
 558                          self.get_matching_blocks(), 0)
 559         return 2.0 * matches / (len(self.a) + len(self.b))
 560
 561     def quick_ratio(self):
 562         """Return an upper bound on ratio() relatively quickly.
 563
 564         This isn't defined beyond that it is an upper bound on .ratio(), and
 565         is faster to compute.
 566         """
 567
 568         # viewing a and b as multisets, set matches to the cardinality
 569         # of their intersection; this counts the number of matches
 570         # without regard to order, so is clearly an upper bound
 571         if self.fullbcount is None:
 572             self.fullbcount = fullbcount = {}
 573             for elt in self.b:
 574                 fullbcount[elt] = fullbcount.get(elt, 0) + 1
 575         fullbcount = self.fullbcount
 576         # avail[x] is the number of times x appears in 'b' less the
 577         # number of times we've seen it in 'a' so far ... kinda
 578         avail = {}
 579         availhas, matches = avail.has_key, 0
 580         for elt in self.a:
 581             if availhas(elt):
 582                 numb = avail[elt]
 583             else:
 584                 numb = fullbcount.get(elt, 0)
 585             avail[elt] = numb - 1
 586             if numb > 0:
 587                 matches = matches + 1
 588         return 2.0 * matches / (len(self.a) + len(self.b))
 589
 590     def real_quick_ratio(self):
 591         """Return an upper bound on ratio() very quickly.
 592
 593         This isn't defined beyond that it is an upper bound on .ratio(), and
 594         is faster to compute than either .ratio() or .quick_ratio().
 595         """
 596
 597         la, lb = len(self.a), len(self.b)
 598         # can't have more matches than the number of elements in the
 599         # shorter sequence
 600         return 2.0 * min(la, lb) / (la + lb)
 601
 602 def get_close_matches(word, possibilities, n=3, cutoff=0.6):
 603     """Use SequenceMatcher to return list of the best "good enough" matches.
 604
 605     word is a sequence for which close matches are desired (typically a
 606     string).
 607
 608     possibilities is a list of sequences against which to match word
 609     (typically a list of strings).
 610
 611     Optional arg n (default 3) is the maximum number of close matches to
 612     return.  n must be > 0.
 613
 614     Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
 615     that don't score at least that similar to word are ignored.
 616
 617     The best (no more than n) matches among the possibilities are returned
 618     in a list, sorted by similarity score, most similar first.
 619
 620     >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
 621     ['apple', 'ape']
 622     >>> import keyword as _keyword
 623     >>> get_close_matches("wheel", _keyword.kwlist)
 624     ['while']
 625     >>> get_close_matches("apple", _keyword.kwlist)
 626     []
 627     >>> get_close_matches("accept", _keyword.kwlist)
 628     ['except']
 629     """
 630
 631     if not n >  0:
 632         raise ValueError("n must be > 0: " + `n`)
 633     if not 0.0 <= cutoff <= 1.0:
 634         raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
 635     result = []
 636     s = SequenceMatcher()
 637     s.set_seq2(word)
 638     for x in possibilities:
 639         s.set_seq1(x)
 640         if s.real_quick_ratio() >= cutoff and \
 641            s.quick_ratio() >= cutoff and \
 642            s.ratio() >= cutoff:
 643             result.append((s.ratio(), x))
 644     # Sort by score.
 645     result.sort()
 646     # Retain only the best n.
 647     result = result[-n:]
 648     # Move best-scorer to head of list.
 649     result.reverse()
 650     # Strip scores.
 651     return [x for score, x in result]
 652
 653
 654 def _count_leading(line, ch):
 655     """
 656     Return number of `ch` characters at the start of `line`.
 657
 658     Example:
 659
 660     >>> _count_leading('   abc', ' ')
 661     3
 662     """
 663
 664     i, n = 0, len(line)
 665     while i < n and line[i] == ch:
 666         i += 1
 667     return i
 668
 669 class Differ:
 670     r"""
 671     Differ is a class for comparing sequences of lines of text, and
 672     producing human-readable differences or deltas.  Differ uses
 673     SequenceMatcher both to compare sequences of lines, and to compare
 674     sequences of characters within similar (near-matching) lines.
 675
 676     Each line of a Differ delta begins with a two-letter code:
 677
 678         '- '    line unique to sequence 1
 679         '+ '    line unique to sequence 2
 680         '  '    line common to both sequences
 681         '? '    line not present in either input sequence
 682
 683     Lines beginning with '? ' attempt to guide the eye to intraline
 684     differences, and were not present in either input sequence.  These lines
 685     can be confusing if the sequences contain tab characters.
 686
 687     Note that Differ makes no claim to produce a *minimal* diff.  To the
 688     contrary, minimal diffs are often counter-intuitive, because they synch
 689     up anywhere possible, sometimes accidental matches 100 pages apart.
 690     Restricting synch points to contiguous matches preserves some notion of
 691     locality, at the occasional cost of producing a longer diff.
 692
 693     Example: Comparing two texts.
 694
 695     First we set up the texts, sequences of individual single-line strings
 696     ending with newlines (such sequences can also be obtained from the
 697     `readlines()` method of file-like objects):
 698
 699     >>> text1 = '''  1. Beautiful is better than ugly.
 700     ...   2. Explicit is better than implicit.
 701     ...   3. Simple is better than complex.
 702     ...   4. Complex is better than complicated.
 703     ... '''.splitlines(1)
 704     >>> len(text1)
 705     4
 706     >>> text1[0][-1]
 707     '\n'
 708     >>> text2 = '''  1. Beautiful is better than ugly.
 709     ...   3.   Simple is better than complex.
 710     ...   4. Complicated is better than complex.
 711     ...   5. Flat is better than nested.
 712     ... '''.splitlines(1)
 713
 714     Next we instantiate a Differ object:
 715
 716     >>> d = Differ()
 717
 718     Note that when instantiating a Differ object we may pass functions to
 719     filter out line and character 'junk'.  See Differ.__init__ for details.
 720
 721     Finally, we compare the two:
 722
 723     >>> result = list(d.compare(text1, text2))
 724
 725     'result' is a list of strings, so let's pretty-print it:
 726
 727     >>> from pprint import pprint as _pprint
 728     >>> _pprint(result)
 729     ['    1. Beautiful is better than ugly.\n',
 730      '-   2. Explicit is better than implicit.\n',
 731      '-   3. Simple is better than complex.\n',
 732      '+   3.   Simple is better than complex.\n',
 733      '?     ++\n',
 734      '-   4. Complex is better than complicated.\n',
 735      '?            ^                     ---- ^\n',
 736      '+   4. Complicated is better than complex.\n',
 737      '?           ++++ ^                      ^\n',
 738      '+   5. Flat is better than nested.\n']
 739
 740     As a single multi-line string it looks like this:
 741
 742     >>> print ''.join(result),
 743         1. Beautiful is better than ugly.
 744     -   2. Explicit is better than implicit.
 745     -   3. Simple is better than complex.
 746     +   3.   Simple is better than complex.
 747     ?     ++
 748     -   4. Complex is better than complicated.
 749     ?            ^                     ---- ^
 750     +   4. Complicated is better than complex.
 751     ?           ++++ ^                      ^
 752     +   5. Flat is better than nested.
 753
 754     Methods:
 755
 756     __init__(linejunk=None, charjunk=None)
 757         Construct a text differencer, with optional filters.
 758
 759     compare(a, b)
 760         Compare two sequences of lines; generate the resulting delta.
 761     """
 762
 763     def __init__(self, linejunk=None, charjunk=None):
 764         """
 765         Construct a text differencer, with optional filters.
 766
 767         The two optional keyword parameters are for filter functions:
 768
 769         - `linejunk`: A function that should accept a single string argument,
 770           and return true iff the string is junk. The module-level function
 771           `IS_LINE_JUNK` may be used to filter out lines without visible
 772           characters, except for at most one splat ('#').  It is recommended
 773           to leave linejunk None; as of Python 2.3, the underlying
 774           SequenceMatcher class has grown an adaptive notion of "noise" lines
 775           that's better than any static definition the author has ever been
 776           able to craft.
 777
 778         - `charjunk`: A function that should accept a string of length 1. The
 779           module-level function `IS_CHARACTER_JUNK` may be used to filter out
 780           whitespace characters (a blank or tab; **note**: bad idea to include
 781           newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
 782         """
 783
 784         self.linejunk = linejunk
 785         self.charjunk = charjunk
 786
 787     def compare(self, a, b):
 788         r"""
 789         Compare two sequences of lines; generate the resulting delta.
 790
 791         Each sequence must contain individual single-line strings ending with
 792         newlines. Such sequences can be obtained from the `readlines()` method
 793         of file-like objects.  The delta generated also consists of newline-
 794         terminated strings, ready to be printed as-is via the writeline()
 795         method of a file-like object.
 796
 797         Example:
 798
 799         >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
 800         ...                                'ore\ntree\nemu\n'.splitlines(1))),
 801         - one
 802         ?  ^
 803         + ore
 804         ?  ^
 805         - two
 806         - three
 807         ?  -
 808         + tree
 809         + emu
 810         """
 811
 812         cruncher = SequenceMatcher(self.linejunk, a, b)
 813         for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
 814             if tag == 'replace':
 815                 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
 816             elif tag == 'delete':
 817                 g = self._dump('-', a, alo, ahi)
 818             elif tag == 'insert':
 819                 g = self._dump('+', b, blo, bhi)
 820             elif tag == 'equal':
 821                 g = self._dump(' ', a, alo, ahi)
 822             else:
 823                 raise ValueError, 'unknown tag ' + `tag`
 824
 825             for line in g:
 826                 yield line
 827
 828     def _dump(self, tag, x, lo, hi):
 829         """Generate comparison results for a same-tagged range."""
 830         for i in xrange(lo, hi):
 831             yield '%s %s' % (tag, x[i])
 832
 833     def _plain_replace(self, a, alo, ahi, b, blo, bhi):
 834         assert alo < ahi and blo < bhi
 835         # dump the shorter block first -- reduces the burden on short-term
 836         # memory if the blocks are of very different sizes
 837         if bhi - blo < ahi - alo:
 838             first  = self._dump('+', b, blo, bhi)
 839             second = self._dump('-', a, alo, ahi)
 840         else:
 841             first  = self._dump('-', a, alo, ahi)
 842             second = self._dump('+', b, blo, bhi)
 843
 844         for g in first, second:
 845             for line in g:
 846                 yield line
 847
 848     def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
 849         r"""
 850         When replacing one block of lines with another, search the blocks
 851         for *similar* lines; the best-matching pair (if any) is used as a
 852         synch point, and intraline difference marking is done on the
 853         similar pair. Lots of work, but often worth it.
 854
 855         Example:
 856
 857         >>> d = Differ()
 858         >>> d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ['abcdefGhijkl\n'], 0, 1)
 859         >>> print ''.join(d.results),
 860         - abcDefghiJkl
 861         ?    ^  ^  ^
 862         + abcdefGhijkl
 863         ?    ^  ^  ^
 864         """
 865
 866         # don't synch up unless the lines have a similarity score of at
 867         # least cutoff; best_ratio tracks the best score seen so far
 868         best_ratio, cutoff = 0.74, 0.75
 869         cruncher = SequenceMatcher(self.charjunk)
 870         eqi, eqj = None, None   # 1st indices of equal lines (if any)
 871
 872         # search for the pair that matches best without being identical
 873         # (identical lines must be junk lines, & we don't want to synch up
 874         # on junk -- unless we have to)
 875         for j in xrange(blo, bhi):
 876             bj = b[j]
 877             cruncher.set_seq2(bj)
 878             for i in xrange(alo, ahi):
 879                 ai = a[i]
 880                 if ai == bj:
 881                     if eqi is None:
 882                         eqi, eqj = i, j
 883                     continue
 884                 cruncher.set_seq1(ai)
 885                 # computing similarity is expensive, so use the quick
 886                 # upper bounds first -- have seen this speed up messy
 887                 # compares by a factor of 3.
 888                 # note that ratio() is only expensive to compute the first
 889                 # time it's called on a sequence pair; the expensive part
 890                 # of the computation is cached by cruncher
 891                 if cruncher.real_quick_ratio() > best_ratio and \
 892                       cruncher.quick_ratio() > best_ratio and \
 893                       cruncher.ratio() > best_ratio:
 894                     best_ratio, best_i, best_j = cruncher.ratio(), i, j
 895         if best_ratio < cutoff:
 896             # no non-identical "pretty close" pair
 897             if eqi is None:
 898                 # no identical pair either -- treat it as a straight replace
 899                 for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
 900                     yield line
 901                 return
 902             # no close pair, but an identical pair -- synch up on that
 903             best_i, best_j, best_ratio = eqi, eqj, 1.0
 904         else:
 905             # there's a close pair, so forget the identical pair (if any)
 906             eqi = None
 907
 908         # a[best_i] very similar to b[best_j]; eqi is None iff they're not
 909         # identical
 910
 911         # pump out diffs from before the synch point
 912         for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
 913             yield line
 914
 915         # do intraline marking on the synch pair
 916         aelt, belt = a[best_i], b[best_j]
 917         if eqi is None:
 918             # pump out a '-', '?', '+', '?' quad for the synched lines
 919             atags = btags = ""
 920             cruncher.set_seqs(aelt, belt)
 921             for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
 922                 la, lb = ai2 - ai1, bj2 - bj1
 923                 if tag == 'replace':
 924                     atags += '^' * la
 925                     btags += '^' * lb
 926                 elif tag == 'delete':
 927                     atags += '-' * la
 928                 elif tag == 'insert':
 929                     btags += '+' * lb
 930                 elif tag == 'equal':
 931                     atags += ' ' * la
 932                     btags += ' ' * lb
 933                 else:
 934                     raise ValueError, 'unknown tag ' + `tag`
 935             for line in self._qformat(aelt, belt, atags, btags):
 936                 yield line
 937         else:
 938             # the synch pair is identical
 939             yield '  ' + aelt
 940
 941         # pump out diffs from after the synch point
 942         for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
 943             yield line
 944
 945     def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
 946         g = []
 947         if alo < ahi:
 948             if blo < bhi:
 949                 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
 950             else:
 951                 g = self._dump('-', a, alo, ahi)
 952         elif blo < bhi:
 953             g = self._dump('+', b, blo, bhi)
 954
 955         for line in g:
 956             yield line
 957
 958     def _qformat(self, aline, bline, atags, btags):
 959         r"""
 960         Format "?" output and deal with leading tabs.
 961
 962         Example:
 963
 964         >>> d = Differ()
 965         >>> d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
 966         ...            '  ^ ^  ^      ', '+  ^ ^  ^      ')
 967         >>> for line in d.results: print repr(line)
 968         ...
 969         '- \tabcDefghiJkl\n'
 970         '? \t ^ ^  ^\n'
 971         '+ \t\tabcdefGhijkl\n'
 972         '? \t  ^ ^  ^\n'
 973         """
 974
 975         # Can hurt, but will probably help most of the time.
 976         common = min(_count_leading(aline, "\t"),
 977                      _count_leading(bline, "\t"))
 978         common = min(common, _count_leading(atags[:common], " "))
 979         atags = atags[common:].rstrip()
 980         btags = btags[common:].rstrip()
 981
 982         yield "- " + aline
 983         if atags:
 984             yield "? %s%s\n" % ("\t" * common, atags)
 985
 986         yield "+ " + bline
 987         if btags:
 988             yield "? %s%s\n" % ("\t" * common, btags)
 989
 990 # With respect to junk, an earlier version of ndiff simply refused to
 991 # *start* a match with a junk element.  The result was cases like this:
 992 #     before: private Thread currentThread;
 993 #     after:  private volatile Thread currentThread;
 994 # If you consider whitespace to be junk, the longest contiguous match
 995 # not starting with junk is "e Thread currentThread".  So ndiff reported
 996 # that "e volatil" was inserted between the 't' and the 'e' in "private".
 997 # While an accurate view, to people that's absurd.  The current version
 998 # looks for matching blocks that are entirely junk-free, then extends the
 999 # longest one of those as far as possible but only with matching junk.
1000 # So now "currentThread" is matched, then extended to suck up the
1001 # preceding blank; then "private" is matched, and extended to suck up the
1002 # following blank; then "Thread" is matched; and finally ndiff reports
1003 # that "volatile " was inserted before "Thread".  The only quibble
1004 # remaining is that perhaps it was really the case that " volatile"
1005 # was inserted after "private".  I can live with that <wink>.
1006
1007 import re
1008
def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return True for an ignorable line: iff `line` is blank or contains a
    single '#' (optionally surrounded by whitespace).

    `pat` is the matching function applied to `line`; the line is junk
    exactly when `pat(line)` does not return None.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    matched = pat(line)
    if matched is None:
        return False
    return True
1024
def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return True for an ignorable character: iff `ch` is a space or tab.

    `ws` is the collection of characters treated as junk; the result is
    simply whether `ch` is found in it.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    if ch in ws:
        return True
    return False
1042
1043 del re
1044
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    This is a convenience wrapper: it builds a `Differ` from the two
    optional filter functions (each may be None) and returns the result of
    its `compare(a, b)`.

    - linejunk: A function taking a single string argument and returning
      true iff the string is junk.  The default is None, and is
      recommended; as of Python 2.3, an adaptive notion of "noise" lines is
      used that does a good job on its own.

    - charjunk: A function taking a string of length 1.  The default is the
      module-level function IS_CHARACTER_JUNK, which filters out whitespace
      characters (a blank or tab; note: bad idea to include newline in
      this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(diff),
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """
    differ = Differ(linejunk, charjunk)
    return differ.compare(a, b)
1080
def restore(delta, which):
    r"""
    Generate one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off line
    prefixes.

    `which` is passed through int() and must then be 1 or 2; any other
    value raises ValueError.  Because this is a generator, that check runs
    when the result is first iterated, not when restore() is called.

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """
    try:
        tag = {1: "- ", 2: "+ "}[int(which)]
    except KeyError:
        raise ValueError('unknown delta choice (must be 1 or 2): %r'
                         % (which,))
    # Keep lines common to both inputs ("  ") plus the lines unique to the
    # requested one; "? " hint lines and the other side's lines are dropped.
    prefixes = ("  ", tag)
    for line in delta:
        if line[:2] in prefixes:
            yield line[2:]
1112
def _test():
    """Run the doctests of the `difflib` module; return testmod()'s result."""
    import doctest
    import difflib
    return doctest.testmod(difflib)
1116
# When executed as a script (rather than imported), run the self-test.
if __name__ == "__main__":
    _test()