# Module ndiff version 1.4.0
# Released to the public domain 27-Mar-1999,
# by Tim Peters (tim_one@email.msn.com).

# Provided as-is; use at your own risk; no warranty; no promises; enjoy!

"""ndiff [-q] file1 file2

ndiff (-r1 | -r2) < ndiff_output > file1_or_file2

Print a human-friendly file difference report to stdout.  Both inter-
and intra-line differences are noted.  In the second form, recreate file1
(-r1) or file2 (-r2) on stdout, from an ndiff report on stdin.

In the first form, if -q ("quiet") is not specified, the first two lines
of output are

-: file1
+: file2

Each remaining line begins with a two-letter code:

    "- "    line unique to file1
    "+ "    line unique to file2
    "  "    line common to both files
    "? "    line not present in either input file

Lines beginning with "? " attempt to guide the eye to intraline
differences, and were not present in either input file.  These lines can
be confusing if the source files contain tab characters.

The first file can be recovered by retaining only lines that begin with
"  " or "- ", and deleting those 2-character prefixes; use ndiff with -r1.

The second file can be recovered similarly, but by retaining only "  "
and "+ " lines; use ndiff with -r2; or, on Unix, the second file can be
recovered by piping the output through

    sed -n '/^[+ ] /s/^..//p'

See module comments for details and programmatic interface.
"""
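
# The recovery rules above, as a small illustrative sketch.  This helper
# is hypothetical (ndiff's real -r1/-r2 handling is restore(), near the
# bottom of the module); it keeps only "  " and tag lines and strips the
# two-character prefixes.

def _restore_sketch(ndiff_lines, which):
    # which is "1" to recover file1, "2" to recover file2
    tag = {"1": "- ", "2": "+ "}[which]
    keep = ("  ", tag)
    result = []
    for line in ndiff_lines:
        if line[:2] in keep:
            result.append(line[2:])
    return result
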
# SequenceMatcher tries to compute a "human-friendly diff" between
# two sequences (chiefly picturing a file as a sequence of lines,
# and a line as a sequence of characters, here).  Unlike e.g. UNIX(tm)
# diff, the fundamental notion is the longest *contiguous* & junk-free
# matching subsequence.  That's what catches people's eyes.  The
# Windows(tm) windiff has another interesting notion, pairing up elements
# that appear uniquely in each sequence.  That, and the method here,
# appear to yield more intuitive difference reports than does diff.  This
# method appears to be the least vulnerable to synching up on blocks
# of "junk lines", though (like blank lines in ordinary text files,
# or maybe "<P>" lines in HTML files).  That may be because this is
# the only method of the 3 that has a *concept* of "junk" <wink>.

# Note that ndiff makes no claim to produce a *minimal* diff.  To the
# contrary, minimal diffs are often counter-intuitive, because they
# synch up anywhere possible, sometimes on accidental matches 100 pages
# apart.  Restricting synch points to contiguous matches preserves some
# notion of locality, at the occasional cost of producing a longer diff.

# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.  The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread".  So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd.  The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible, but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread".  The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private".  I can live with that <wink>.

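# A sketch demonstrating the junk-extension behavior just described (the
# helper is hypothetical and never called by ndiff; run it by hand):

def _junk_extension_sketch():
    before = "private Thread currentThread;"
    after  = "private volatile Thread currentThread;"
    s = SequenceMatcher(IS_CHARACTER_JUNK, before, after)
    # expect blocks covering "private ", "Thread", " currentThread;",
    # leaving "volatile " as the insertion before "Thread"
    for block in s.get_matching_blocks():
        print block
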
# NOTE on junk:  the module-level names
#    IS_LINE_JUNK
#    IS_CHARACTER_JUNK
# can be set to any functions you like.  The first one should accept
# a single string argument, and return true iff the string is junk.
# The default is whether the regexp r"\s*#?\s*$" matches (i.e., a
# line without visible characters, except for at most one splat).
# The second should accept a string of length 1, and return true iff
# the character is junk.  The default is whether the character is a
# blank or tab (note: bad idea to include newline in this!).

# After setting those, you can call fcompare(f1name, f2name) with the
# names of the files you want to compare.  The difference report
# is sent to stdout.  Or you can call main(args), passing what would
# have been in sys.argv[1:] had the cmd-line form been used.

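# For example (a sketch; the file names are hypothetical):
#
#     import ndiff
#     def my_line_junk(line):              # a custom notion of junk
#         return line == "\n"
#     ndiff.IS_LINE_JUNK = my_line_junk
#     ndiff.fcompare("old.txt", "new.txt") # report printed to stdout
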
import string, re, sys

TRACE = 0    # set non-zero for debugging output

# define what "junk" means

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    return ch in ws

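# A quick self-check of the default junk predicates (a sketch; the helper
# is made up and never called by ndiff):

def _junk_defaults_sketch():
    assert IS_LINE_JUNK("\n")              # empty line
    assert IS_LINE_JUNK("  #  \n")         # blanks plus one splat
    assert not IS_LINE_JUNK("x = 1\n")     # visible characters
    assert IS_CHARACTER_JUNK(" ") and IS_CHARACTER_JUNK("\t")
    assert not IS_CHARACTER_JUNK("\n")     # newline deliberately not junk
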
class SequenceMatcher:

    def __init__(self, isjunk=None, a='', b=''):
        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # b2jhas
        #      b2j.has_key
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!

        self.a = self.b = None
        self.isjunk = isjunk
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly
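    # E.g. (a sketch of the cross-product pattern just described):
    #     s = SequenceMatcher(IS_LINE_JUNK)
    #     s.set_seq2(new_lines)        # b2j built once, here
    #     for old_lines in candidates:
    #         s.set_seq1(old_lines)    # cheap; b2j is reused
    #         print s.ratio()
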
    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed without it.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # in the first place.
        b = self.b
        self.b2j = b2j = {}
        self.b2jhas = b2jhas = b2j.has_key
        for i in xrange(len(b)):
            elt = b[i]
            if b2jhas(elt):
                b2j[elt].append(i)
            else:
                b2j[elt] = [i]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk, junkdict = self.isjunk, {}
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junkdict[elt] = 1   # value irrelevant; it's a set
                    del b2j[elt]

        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        self.isbjunk = junkdict.has_key

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'
        In other words, of all maximal matching blocks, return one
        that starts earliest in a, and of all those maximal matching
        blocks that start earliest in a, return the one that starts
        earliest in b.

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that
        no junk element appears in the block.  Then that block is
        extended as far as possible by matching (only) junk elements on
        both sides.  So the resulting block never matches on junk except
        as identical junk happens to be adjacent to an "interesting"
        match.

        If no blocks match, return (alo, blo, 0).
        """
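        # Example (a sketch of the semantics above):
        #     SequenceMatcher(None, " abcd", "abcd abcd")
        #         .find_longest_match(0, 5, 0, 9)
        # returns (0, 4, 5), since the second "abcd" can be extended to
        # include the preceding blank; if blanks are junk, the junk-free
        # answer is (1, 0, 4) instead.
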
        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.
        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        if TRACE:
            print "find_longest_match", alo, ahi, blo, bhi
            print "  returns", besti, bestj, bestsize
        return besti, bestj, bestsize

    def get_matching_blocks(self):
        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        if TRACE:
            print '*** matching blocks', self.matching_blocks
        return self.matching_blocks
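
    # Example (sketch):
    #     SequenceMatcher(None, "abxcd", "abcd").get_matching_blocks()
    # returns [(0, 0, 2), (3, 2, 2), (5, 4, 0)]: "ab", "cd", and the
    # (len(a), len(b), 0) sentinel.
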
    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer
    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append(x)
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return 2.0 * matches / (len(self.a) + len(self.b))
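
    # E.g. (sketch): for a = "abcd", b = "bcde" the matching blocks are
    # [(1, 0, 3), (4, 4, 0)], so M = 3, T = 8, and ratio() returns 0.75.
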
    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly."""
        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return 2.0 * matches / (len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly."""
        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return 2.0 * min(la, lb) / (la + lb)
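
    # The three ratios form a cheap-to-expensive chain of upper bounds
    # (sketch):
    #     s = SequenceMatcher(None, "abcd", "bcde")
    #     s.real_quick_ratio()  -> 1.0     # from the lengths alone
    #     s.quick_ratio()       -> 0.75    # multiset intersection
    #     s.ratio()             -> 0.75    # true matching blocks
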
    def get_opcodes(self):
        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant: we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0; don't emit an 'equal' opcode for it
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer
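
    # Example (sketch):
    #     SequenceMatcher(None, "qabxcd", "abycdf").get_opcodes()
    # returns
    #     [('delete',  0, 1, 0, 0),     # kill the leading 'q'
    #      ('equal',   1, 3, 0, 2),     # 'ab' matches
    #      ('replace', 3, 4, 2, 3),     # 'x' -> 'y'
    #      ('equal',   4, 6, 3, 5),     # 'cd' matches
    #      ('insert',  6, 6, 5, 6)]     # append 'f'
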
# meant for dumping lines
def dump(tag, x, lo, hi):
    for i in xrange(lo, hi):
        print tag, x[i],    # trailing comma: the lines already end with '\n'

# figure out which mark to stick under characters in lines that
# have changed (blank = same, - = deleted, + = inserted, ^ = replaced)
_combine = { '  ': ' ',
             '. ': '-',
             ' .': '+',
             '..': '^' }

def plain_replace(a, alo, ahi, b, blo, bhi):
    assert alo < ahi and blo < bhi
    # dump the shorter block first -- reduces the burden on short-term
    # memory if the blocks are of very different sizes
    if bhi - blo < ahi - alo:
        dump('+', b, blo, bhi)
        dump('-', a, alo, ahi)
    else:
        dump('-', a, alo, ahi)
        dump('+', b, blo, bhi)

# When replacing one block of lines with another, this guy searches
# the blocks for *similar* lines; the best-matching pair (if any) is
# used as a synch point, and intraline difference marking is done on
# the similar pair.  Lots of work, but often worth it.
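# For example (a sketch): replacing ["one\n", "two\n", "three\n"] with
# ["ore\n", "tree\n", "emu\n"] synchs on the similar pairs one/ore and
# three/tree, producing
#
#     - one
#     ?  ^
#     + ore
#     ?  ^
#     - two
#     - three
#     ?  -
#     + tree
#     + emu
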
def fancy_replace(a, alo, ahi, b, blo, bhi):
    if TRACE:
        print '*** fancy_replace', alo, ahi, blo, bhi
        dump('>', a, alo, ahi)
        dump('<', b, blo, bhi)

    # don't synch up unless the lines have a similarity score of at
    # least cutoff; best_ratio tracks the best score seen so far
    best_ratio, cutoff = 0.74, 0.75
    cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
    eqi, eqj = None, None   # 1st indices of equal lines (if any)

    # search for the pair that matches best without being identical
    # (identical lines must be junk lines, & we don't want to synch up
    # on junk -- unless we have to)
    for j in xrange(blo, bhi):
        bj = b[j]
        cruncher.set_seq2(bj)
        for i in xrange(alo, ahi):
            ai = a[i]
            if ai == bj:
                if eqi is None:
                    eqi, eqj = i, j
                continue
            cruncher.set_seq1(ai)
            # computing similarity is expensive, so use the quick
            # upper bounds first -- have seen this speed up messy
            # compares by a factor of 3.
            # note that ratio() is only expensive to compute the first
            # time it's called on a sequence pair; the expensive part
            # of the computation is cached by cruncher
            if cruncher.real_quick_ratio() > best_ratio and \
                  cruncher.quick_ratio() > best_ratio and \
                  cruncher.ratio() > best_ratio:
                best_ratio, best_i, best_j = cruncher.ratio(), i, j
    if best_ratio < cutoff:
        # no non-identical "pretty close" pair
        if eqi is None:
            # no identical pair either -- treat it as a straight replace
            plain_replace(a, alo, ahi, b, blo, bhi)
            return
        # no close pair, but an identical pair -- synch up on that
        best_i, best_j, best_ratio = eqi, eqj, 1.0
    else:
        # there's a close pair, so forget the identical pair (if any)
        eqi = None

    # a[best_i] very similar to b[best_j]; eqi is None iff they're not
    # identical
    if TRACE:
        print '*** best_ratio', best_ratio, best_i, best_j
        dump('>', a, best_i, best_i+1)
        dump('<', b, best_j, best_j+1)

    # pump out diffs from before the synch point
    fancy_helper(a, alo, best_i, b, blo, best_j)

    # do intraline marking on the synch pair
    aelt, belt = a[best_i], b[best_j]
    if eqi is None:
        # pump out a '-', '+', '?' triple for the synched lines:
        atags = btags = ""
        cruncher.set_seqs(aelt, belt)
        for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
            la, lb = ai2 - ai1, bj2 - bj1
            if tag == 'replace':
                atags = atags + '.' * la
                btags = btags + '.' * lb
            elif tag == 'delete':
                atags = atags + '.' * la
            elif tag == 'insert':
                btags = btags + '.' * lb
            elif tag == 'equal':
                atags = atags + ' ' * la
                btags = btags + ' ' * lb
            else:
                raise ValueError, 'unknown tag ' + `tag`
        # pad the shorter tag string so the two can be combined
        la, lb = len(atags), len(btags)
        if la < lb:
            atags = atags + ' ' * (lb - la)
        elif lb < la:
            btags = btags + ' ' * (la - lb)
        combined = map(lambda x,y: _combine[x+y], atags, btags)
        print '-', aelt, '+', belt, '?', \
              string.rstrip(string.join(combined, ''))
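        # E.g. (sketch): for "three\n" vs "tree\n" the opcodes give
        # atags = " .    " and btags = "     ", which combine (after
        # padding and rstrip) into the '?' line "?  -" -- a '-' under
        # the deleted 'h'.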
    else:
        # the synch pair is identical
        print ' ', aelt,

    # pump out diffs from after the synch point
    fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

def fancy_helper(a, alo, ahi, b, blo, bhi):
    if alo < ahi:
        if blo < bhi:
            fancy_replace(a, alo, ahi, b, blo, bhi)
        else:
            dump('-', a, alo, ahi)
    elif blo < bhi:
        dump('+', b, blo, bhi)

# print message + usage to stderr and return false
def fail(msg):
    out = sys.stderr.write
    out(msg + "\n\n")
    out(__doc__)
    return 0

# open a file & return the file object; gripe and return 0 if it
# couldn't be opened
def fopen(fname):
    try:
        return open(fname, 'r')
    except IOError, detail:
        return fail("couldn't open " + fname + ": " + str(detail))

# open two files & spray the diff to stdout; return false iff a problem
def fcompare(f1name, f2name):
    f1 = fopen(f1name)
    f2 = fopen(f2name)
    if not f1 or not f2:
        return 0

    a = f1.readlines(); f1.close()
    b = f2.readlines(); f2.close()

    cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
    for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
        if tag == 'replace':
            fancy_replace(a, alo, ahi, b, blo, bhi)
        elif tag == 'delete':
            dump('-', a, alo, ahi)
        elif tag == 'insert':
            dump('+', b, blo, bhi)
        elif tag == 'equal':
            dump(' ', a, alo, ahi)
        else:
            raise ValueError, 'unknown tag ' + `tag`

    return 1

# crack args (sys.argv[1:] is normal) & compare;
# return false iff a problem
def main(args):
    import getopt
    try:
        opts, args = getopt.getopt(args, "qr:")
    except getopt.error, detail:
        return fail(str(detail))
    noisy = 1
    qseen = rseen = 0
    for opt, val in opts:
        if opt == "-q":
            qseen = 1
            noisy = 0
        elif opt == "-r":
            rseen = 1
            whichfile = val
    if qseen and rseen:
        return fail("can't specify both -q and -r")
    if rseen:
        if args:
            return fail("no args allowed with -r option")
        if whichfile in "12":
            restore(whichfile)
            return 1
        return fail("-r value must be 1 or 2")
    if len(args) != 2:
        return fail("need 2 filename args")
    f1name, f2name = args
    if noisy:
        print '-:', f1name
        print '+:', f2name
    return fcompare(f1name, f2name)

# read ndiff output from stdin, and recreate file1 (which == "1") or
# file2 (which == "2") on stdout
def restore(which):
    tag = {"1": "- ", "2": "+ "}[which]
    prefixes = ("  ", tag)
    for line in sys.stdin.readlines():
        if line[:2] in prefixes:
            print line[2:],

if __name__ == '__main__':
    args = sys.argv[1:]
    if "-profile" in args:
        import profile, pstats
        args.remove("-profile")
        statf = "ndiff.pro"    # where profile.run dumps its stats
        profile.run("main(args)", statf)
        stats = pstats.Stats(statf)
        stats.strip_dirs().sort_stats('time').print_stats()
    else:
        main(args)