# Module ndiff version 1.6.0
# Released to the public domain 08-Dec-2000,
# by Tim Peters (tim.one@home.com).

# Provided as-is; use at your own risk; no warranty; no promises; enjoy!

"""ndiff [-q] file1 file2
    or
ndiff (-r1 | -r2) < ndiff_output > file1_or_file2

Print a human-friendly file difference report to stdout. Both inter-
and intra-line differences are noted. In the second form, recreate file1
(-r1) or file2 (-r2) on stdout, from an ndiff report on stdin.

In the first form, if -q ("quiet") is not specified, the first two lines
of output are

-: file1
+: file2

Each remaining line begins with a two-letter code:

    "- "    line unique to file1
    "+ "    line unique to file2
    "  "    line common to both files
    "? "    line not present in either input file

Lines beginning with "? " attempt to guide the eye to intraline
differences, and were not present in either input file. These lines can be
confusing if the source files contain tab characters.

The first file can be recovered by retaining only lines that begin with
"  " or "- ", and deleting those 2-character prefixes; use ndiff with -r1.

The second file can be recovered similarly, but by retaining only "  " and
"+ " lines; use ndiff with -r2; or, on Unix, the second file can be
recovered by piping the output through

    sed -n '/^[+ ] /s/^..//p'

See module comments for details and programmatic interface.
"""

__version__ = 1, 6, 0

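# Illustrative sketch (not part of the original module): the recovery rule
# described in the docstring above -- keep only the "  " and "+ " lines of
# a report and strip the 2-character prefixes -- rebuilds file2.  The tiny
# report below is made up.
def _demo_recover_file2():
    report = ["  common line\n",
              "- only in file1\n",
              "+ only in file2\n",
              "? ++++\n"]
    for line in report:
        if line[:2] in ("  ", "+ "):
            print line[2:],
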
# SequenceMatcher tries to compute a "human-friendly diff" between
# two sequences (chiefly picturing a file as a sequence of lines,
# and a line as a sequence of characters, here). Unlike e.g. UNIX(tm)
# diff, the fundamental notion is the longest *contiguous* & junk-free
# matching subsequence. That's what catches peoples' eyes. The
# Windows(tm) windiff has another interesting notion, pairing up elements
# that appear uniquely in each sequence. That, and the method here,
# appear to yield more intuitive difference reports than does diff. This
# method appears to be the least vulnerable to synching up on blocks
# of "junk lines", though (like blank lines in ordinary text files,
# or maybe "<P>" lines in HTML files). That may be because this is
# the only method of the 3 that has a *concept* of "junk" <wink>.

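# Illustrative sketch (not part of the original module): the matching
# blocks SequenceMatcher reports are always contiguous runs, never
# scattered single-element matches.  The trailing (len(a), len(b), 0)
# entry is a sentinel.
def _demo_matching_blocks():
    from difflib import SequenceMatcher
    a, b = "private Thread", "private volatile Thread"
    for i, j, n in SequenceMatcher(None, a, b).get_matching_blocks():
        # each (i, j, n) triple means a[i:i+n] == b[j:j+n]
        print (i, j, n), repr(a[i:i+n])
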
# Note that ndiff makes no claim to produce a *minimal* diff. To the
# contrary, minimal diffs are often counter-intuitive, because they
# synch up anywhere possible, sometimes accidental matches 100 pages
# apart. Restricting synch points to contiguous matches preserves some
# notion of locality, at the occasional cost of producing a longer diff.
#
# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element. The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread". So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd. The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread". The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private". I can live with that <wink>.

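# A quick check of the behavior described above (illustrative only, not
# part of the original module): with blanks and tabs treated as junk,
# the opcodes should show a single insert of "volatile " rather than
# the old "e volatil" absurdity.
def _demo_volatile_example():
    from difflib import SequenceMatcher
    before = "private Thread currentThread;"
    after = "private volatile Thread currentThread;"
    cruncher = SequenceMatcher(lambda ch: ch in " \t", before, after)
    for tag, i1, i2, j1, j2 in cruncher.get_opcodes():
        print tag, repr(before[i1:i2]), repr(after[j1:j2])
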
# NOTE on junk: the module-level names
#    IS_LINE_JUNK
#    IS_CHARACTER_JUNK
# can be set to any functions you like. The first one should accept
# a single string argument, and return true iff the string is junk.
# The default is whether the regexp r"\s*#?\s*$" matches (i.e., a
# line without visible characters, except for at most one splat).
# The second should accept a string of length 1 etc. The default is
# whether the character is a blank or tab (note: bad idea to include
# newline in this!).
#
# After setting those, you can call fcompare(f1name, f2name) with the
# names of the files you want to compare. The difference report
# is sent to stdout. Or you can call main(args), passing what would
# have been in sys.argv[1:] had the cmd-line form been used.

from difflib import SequenceMatcher

import sys

TRACE = 0

# define what "junk" means

import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    return ch in ws

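# Illustrative sketch of the programmatic interface described in the module
# comments (not part of the original module; the file names and the extra
# junk rule are made up): rebind the junk hook, then ask for a report.
def _demo_programmatic_use():
    global IS_LINE_JUNK
    default_junk = IS_LINE_JUNK
    # also treat lines starting with "//" as junk
    IS_LINE_JUNK = lambda line, dj=default_junk: dj(line) or line[:2] == "//"
    return fcompare("old_version.py", "new_version.py")
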
# meant for dumping lines
def dump(tag, x, lo, hi):
    for i in xrange(lo, hi):
        print tag, x[i],

def plain_replace(a, alo, ahi, b, blo, bhi):
    assert alo < ahi and blo < bhi
    # dump the shorter block first -- reduces the burden on short-term
    # memory if the blocks are of very different sizes
    if bhi - blo < ahi - alo:
        dump('+', b, blo, bhi)
        dump('-', a, alo, ahi)
    else:
        dump('-', a, alo, ahi)
        dump('+', b, blo, bhi)

# When replacing one block of lines with another, this guy searches
# the blocks for *similar* lines; the best-matching pair (if any) is
# used as a synch point, and intraline difference marking is done on
# the similar pair. Lots of work, but often worth it.

def fancy_replace(a, alo, ahi, b, blo, bhi):
    if TRACE:
        print '*** fancy_replace', alo, ahi, blo, bhi
        dump('>', a, alo, ahi)
        dump('<', b, blo, bhi)

    # don't synch up unless the lines have a similarity score of at
    # least cutoff; best_ratio tracks the best score seen so far
    best_ratio, cutoff = 0.74, 0.75
    cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
    eqi, eqj = None, None  # 1st indices of equal lines (if any)

    # search for the pair that matches best without being identical
    # (identical lines must be junk lines, & we don't want to synch up
    # on junk -- unless we have to)
    for j in xrange(blo, bhi):
        bj = b[j]
        cruncher.set_seq2(bj)
        for i in xrange(alo, ahi):
            ai = a[i]
            if ai == bj:
                if eqi is None:
                    eqi, eqj = i, j
                continue
            cruncher.set_seq1(ai)
            # computing similarity is expensive, so use the quick
            # upper bounds first -- have seen this speed up messy
            # compares by a factor of 3.
            # note that ratio() is only expensive to compute the first
            # time it's called on a sequence pair; the expensive part
            # of the computation is cached by cruncher
            if cruncher.real_quick_ratio() > best_ratio and \
                  cruncher.quick_ratio() > best_ratio and \
                  cruncher.ratio() > best_ratio:
                best_ratio, best_i, best_j = cruncher.ratio(), i, j
    if best_ratio < cutoff:
        # no non-identical "pretty close" pair
        if eqi is None:
            # no identical pair either -- treat it as a straight replace
            plain_replace(a, alo, ahi, b, blo, bhi)
            return
        # no close pair, but an identical pair -- synch up on that
        best_i, best_j, best_ratio = eqi, eqj, 1.0
    else:
        # there's a close pair, so forget the identical pair (if any)
        eqi = None

    # a[best_i] very similar to b[best_j]; eqi is None iff they're not
    # identical
    if TRACE:
        print '*** best_ratio', best_ratio, best_i, best_j
        dump('>', a, best_i, best_i+1)
        dump('<', b, best_j, best_j+1)

    # pump out diffs from before the synch point
    fancy_helper(a, alo, best_i, b, blo, best_j)

    # do intraline marking on the synch pair
    aelt, belt = a[best_i], b[best_j]
    if eqi is None:
        # pump out a '-', '?', '+', '?' quad for the synched lines
        atags = btags = ""
        cruncher.set_seqs(aelt, belt)
        for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
            la, lb = ai2 - ai1, bj2 - bj1
            if tag == 'replace':
                atags += '^' * la
                btags += '^' * lb
            elif tag == 'delete':
                atags += '-' * la
            elif tag == 'insert':
                btags += '+' * lb
            elif tag == 'equal':
                atags += ' ' * la
                btags += ' ' * lb
            else:
                raise ValueError, 'unknown tag ' + `tag`
        printq(aelt, belt, atags, btags)
    else:
        # the synch pair is identical
        print ' ', aelt,

    # pump out diffs from after the synch point
    fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

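# Illustrative sketch (not part of the original module): calling
# fancy_replace on two similar one-line blocks prints the '-', '?',
# '+', '?' quad described above, with '^', '-' and '+' marking the
# intraline differences.
def _demo_fancy_replace():
    fancy_replace(["abcDefghiJkl\n"], 0, 1, ["abcdefGhijkl\n"], 0, 1)
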
def fancy_helper(a, alo, ahi, b, blo, bhi):
    if alo < ahi:
        if blo < bhi:
            fancy_replace(a, alo, ahi, b, blo, bhi)
        else:
            dump('-', a, alo, ahi)
    elif blo < bhi:
        dump('+', b, blo, bhi)

# Crap to deal with leading tabs in "?" output. Can hurt, but will
# probably help most of the time.

def printq(aline, bline, atags, btags):
    common = min(count_leading(aline, "\t"),
                 count_leading(bline, "\t"))
    common = min(common, count_leading(atags[:common], " "))
    print "-", aline,
    if count_leading(atags, " ") < len(atags):
        print "?", "\t" * common + atags[common:]
    print "+", bline,
    if count_leading(btags, " ") < len(btags):
        print "?", "\t" * common + btags[common:]

def count_leading(line, ch):
    i, n = 0, len(line)
    while i < n and line[i] == ch:
        i = i + 1
    return i

def fail(msg):
    out = sys.stderr.write
    out(msg + "\n\n")
    out(__doc__)
    return 0

# open a file & return the file object; gripe and return 0 if it
# couldn't be opened
def fopen(fname):
    try:
        return open(fname, 'r')
    except IOError, detail:
        return fail("couldn't open " + fname + ": " + str(detail))

# open two files & spray the diff to stdout; return false iff a problem
def fcompare(f1name, f2name):
    f1 = fopen(f1name)
    f2 = fopen(f2name)
    if not f1 or not f2:
        return 0

    a = f1.readlines(); f1.close()
    b = f2.readlines(); f2.close()

    cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
    for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
        if tag == 'replace':
            fancy_replace(a, alo, ahi, b, blo, bhi)
        elif tag == 'delete':
            dump('-', a, alo, ahi)
        elif tag == 'insert':
            dump('+', b, blo, bhi)
        elif tag == 'equal':
            dump(' ', a, alo, ahi)
        else:
            raise ValueError, 'unknown tag ' + `tag`

    return 1

# crack args (sys.argv[1:] is normal) & compare;
# return false iff a problem
def main(args):
    import getopt
    try:
        opts, args = getopt.getopt(args, "qr:")
    except getopt.error, detail:
        return fail(str(detail))
    noisy = 1
    qseen = rseen = 0
    for opt, val in opts:
        if opt == "-q":
            qseen, noisy = 1, 0
        elif opt == "-r":
            rseen, whichfile = 1, val
    if qseen and rseen:
        return fail("can't specify both -q and -r")
    if rseen:
        if args:
            return fail("no args allowed with -r option")
        if whichfile in "12":
            restore(whichfile)
            return 1
        return fail("-r value must be 1 or 2")
    if len(args) != 2:
        return fail("need 2 filename args")
    f1name, f2name = args
    if noisy:
        print '-:', f1name
        print '+:', f2name
    return fcompare(f1name, f2name)

def restore(which):
    tag = {"1": "- ", "2": "+ "}[which]
    prefixes = ("  ", tag)
    for line in sys.stdin.readlines():
        if line[:2] in prefixes:
            print line[2:],

if __name__ == '__main__':
    args = sys.argv[1:]
    if "-profile" in args:
        import profile, pstats
        args.remove("-profile")
        statf = "ndiff.pro"   # file to hold the profiler stats
        profile.run("main(args)", statf)
        stats = pstats.Stats(statf)
        stats.strip_dirs().sort_stats('time').print_stats()
    else:
        main(args)