#! /usr/bin/env python

from __future__ import generators

"""
Module difflib -- helpers for computing deltas between objects.

Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
    Use SequenceMatcher to return list of the best "good enough" matches.

Function ndiff(a, b):
    Return a delta: the difference between `a` and `b` (lists of strings).

Function restore(delta, which):
    Return one of the two sequences that generated an ndiff delta.

Class SequenceMatcher:
    A flexible class for comparing pairs of sequences of any type.

Class Differ:
    For producing human-readable deltas from sequences of lines of text.
"""

__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ']
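
# A quick sketch of how the public functions fit together (the names a_lines
# and b_lines are hypothetical; `restore` recovers either input from an
# ndiff delta):
#
#     delta = list(ndiff(a_lines, b_lines))
#     assert list(restore(delta, 1)) == a_lines
#     assert list(restore(delta, 2)) == b_lines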

class SequenceMatcher:

    """
    SequenceMatcher is a flexible class for comparing pairs of sequences of
    any type, so long as the sequence elements are hashable.  The basic
    algorithm predates, and is a little fancier than, an algorithm
    published in the late 1980's by Ratcliff and Obershelp under the
    hyperbolic name "gestalt pattern matching".  The basic idea is to find
    the longest contiguous matching subsequence that contains no "junk"
    elements (R-O doesn't address junk).  The same idea is then applied
    recursively to the pieces of the sequences to the left and to the right
    of the matching subsequence.  This does not yield minimal edit
    sequences, but does tend to yield matches that "look right" to people.

    SequenceMatcher tries to compute a "human-friendly diff" between two
    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
    longest *contiguous* & junk-free matching subsequence.  That's what
    catches peoples' eyes.  The Windows(tm) windiff has another interesting
    notion, pairing up elements that appear uniquely in each sequence.
    That, and the method here, appear to yield more intuitive difference
    reports than does diff.  This method appears to be the least vulnerable
    to synching up on blocks of "junk lines", though (like blank lines in
    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
    because this is the only method of the 3 that has a *concept* of
    "junk" <wink>.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>>

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches:

    >>> print round(s.ratio(), 3)
    0.866
    >>>

    If you're only interested in where the sequences match,
    .get_matching_blocks() is handy:

    >>> for block in s.get_matching_blocks():
    ...     print "a[%d] and b[%d] match for %d elements" % block
    a[0] and b[0] match for 8 elements
    a[8] and b[17] match for 6 elements
    a[14] and b[23] match for 15 elements
    a[29] and b[38] match for 0 elements

    Note that the last tuple returned by .get_matching_blocks() is always a
    dummy, (len(a), len(b), 0), and this is the only case in which the last
    tuple element (number of elements matched) is 0.

    If you want to know how to change the first sequence into the second,
    use .get_opcodes():

    >>> for opcode in s.get_opcodes():
    ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
     equal a[0:8] b[0:8]
    insert a[8:8] b[8:17]
     equal a[8:14] b[17:23]
     equal a[14:29] b[23:38]

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    See also function get_close_matches() in this module, which shows how
    simple code building on SequenceMatcher can be used to do useful work.

    Timing:  Basic R-O is cubic time worst case and quadratic time expected
    case.  SequenceMatcher is quadratic time for the worst case and has
    expected-case behavior dependent in a complicated way on how many
    elements the sequences have in common; best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo, ahi, blo, bhi)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b=''):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared.  By
        default, an empty string.  The elements of a must be hashable.  See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared.  By
        default, an empty string.  The elements of b must be hashable.  See
        also .set_seqs() and .set_seq2().
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # b2jhas
        #      b2j.has_key
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!
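        #
        # A concrete illustration of b2j (hypothetical values, nothing here
        # is executed): with b = "abcb" and no junk,
        #     b2j == {'a': [0], 'b': [1, 3], 'c': [2]}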

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly
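    #
    # A sketch of that cross-product pattern (illustrative only; the names
    # are hypothetical, but get_close_matches() below does exactly this):
    #
    #     s = SequenceMatcher()
    #     s.set_seq2(target)           # preprocessing of b happens once here
    #     for candidate in candidates:
    #         s.set_seq1(candidate)    # cheap: cached info about b is reused
    #         if s.ratio() >= 0.6:
    #             print candidate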

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}
        self.b2jhas = b2jhas = b2j.has_key
        for i in xrange(len(b)):
            elt = b[i]
            if b2jhas(elt):
                b2j[elt].append(i)
            else:
                b2j[elt] = [i]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk, junkdict = self.isjunk, {}
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junkdict[elt] = 1   # value irrelevant; it's a set
                    del b2j[elt]

        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
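        # For instance (hypothetical values, not executed): with b = "a b"
        # and isjunk = lambda x: x == " ", the loop above leaves
        # b2j == {'a': [0], 'b': [2]} and junkdict == {' ': 1}.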
        self.isbjunk = junkdict.has_key

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
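        # (illustrative walk-through, not executed: with a = "ab" and
        # b = "xab", the first pass through the outer loop leaves
        # j2len == {1: 1}; the second pass builds {2: 2}, recording the
        # 2-element match a[0:2] == b[1:3], i.e. best == (0, 1, 2))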
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        return self.matching_blocks

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append(x)
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return 2.0 * matches / (len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
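        # (worked example, not executed: with a = "aab" and b = "ab", the
        # first 'a' and the 'b' each find a match, the second 'a' does not,
        # so matches == 2 and the bound is 2 * 2.0 / 5 == 0.8)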
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return 2.0 * matches / (len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return 2.0 * min(la, lb) / (la + lb)

def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: " + `n`)
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))
    # Sort by score.
    result.sort()
    # Retain only the best n.
    result = result[-n:]
    # Move best-scorer to head of list.
    result.reverse()
    # Strip scores.
    return [x for score, x in result]

def _count_leading(line, ch):
    """
    Return number of `ch` characters at the start of `line`.

    Example:

    >>> _count_leading('   abc', ' ')
    3
    """

    i, n = 0, len(line)
    while i < n and line[i] == ch:
        i += 1
    return i

class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk.  The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').

        - `charjunk`: A function that should accept a string of length 1.
          The module-level function `IS_CHARACTER_JUNK` may be used to filter
          out whitespace characters (a blank or tab; **note**: bad idea to
          include newline in this!).
        """

        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending
        with newlines.  Such sequences can be obtained from the
        `readlines()` method of file-like objects.  The delta generated
        also consists of newline-terminated strings, ready to be written
        as-is via the `writelines()` method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag ' + `tag`

            for line in g:
                yield line

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in xrange(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            for line in g:
                yield line

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair.  Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag ' + `tag`
            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        for line in g:
            yield line

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '+  ^ ^  ^      ')
        >>> for line in results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \t\tabcdefGhijkl\n'
        '? \t  ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)

# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.  The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread".  So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd.  The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread".  The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private".  I can live with that <wink>.
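#
# To make that concrete, here is what ndiff produces for that pair (an
# illustration, not one of the module's doctests):
#
#     >>> print ''.join(ndiff(["private Thread currentThread;\n"],
#     ...                     ["private volatile Thread currentThread;\n"])),
#     - private Thread currentThread;
#     + private volatile Thread currentThread;
#     ?         +++++++++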

import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return 1 for ignorable line: iff `line` is blank or contains a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    1
    >>> IS_LINE_JUNK('  #   \n')
    1
    >>> IS_LINE_JUNK('hello\n')
    0
    """

    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return 1 for ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    1
    >>> IS_CHARACTER_JUNK('\t')
    1
    >>> IS_CHARACTER_JUNK('\n')
    0
    >>> IS_CHARACTER_JUNK('x')
    0
    """

    return ch in ws

del re

def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    Optional keyword parameters `linejunk` and `charjunk` are for filter
    functions (or None):

    - linejunk: A function that should accept a single string argument, and
      return true iff the string is junk.  The default is module-level
      function IS_LINE_JUNK, which filters out lines without visible
      characters, except for at most one splat ('#').

    - charjunk: A function that should accept a string of length 1.  The
      default is module-level function IS_CHARACTER_JUNK, which filters out
      whitespace characters (a blank or tab; note: bad idea to include
      newline in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(diff),
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """
    return Differ(linejunk, charjunk).compare(a, b)

def restore(delta, which):
    r"""
    Generate one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off
    line prefixes.

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """
    try:
        tag = {1: "- ", 2: "+ "}[int(which)]
    except KeyError:
        raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
                           % which)
    prefixes = ("  ", tag)
    for line in delta:
        if line[:2] in prefixes:
            yield line[2:]

def _test():
    import doctest, difflib
    return doctest.testmod(difflib)

if __name__ == "__main__":
    _test()