Files for 2.1b1 distribution.
[python/dscho.git] / Tools / scripts / ndiff.py
blobddca07d38ff3cdb2065947e1aadc41051988efa1
1 #! /usr/bin/env python
3 # Module ndiff version 1.6.0
4 # Released to the public domain 08-Dec-2000,
5 # by Tim Peters (tim.one@home.com).
7 # Provided as-is; use at your own risk; no warranty; no promises; enjoy!
9 """ndiff [-q] file1 file2
11 ndiff (-r1 | -r2) < ndiff_output > file1_or_file2
13 Print a human-friendly file difference report to stdout. Both inter-
14 and intra-line differences are noted. In the second form, recreate file1
15 (-r1) or file2 (-r2) on stdout, from an ndiff report on stdin.
17 In the first form, if -q ("quiet") is not specified, the first two lines
18 of output are
20 -: file1
21 +: file2
23 Each remaining line begins with a two-letter code:
25 "- " line unique to file1
26 "+ " line unique to file2
27 " " line common to both files
28 "? " line not present in either input file
30 Lines beginning with "? " attempt to guide the eye to intraline
31 differences, and were not present in either input file. These lines can be
32 confusing if the source files contain tab characters.
34 The first file can be recovered by retaining only lines that begin with
35 " " or "- ", and deleting those 2-character prefixes; use ndiff with -r1.
37 The second file can be recovered similarly, but by retaining only " " and
38 "+ " lines; use ndiff with -r2; or, on Unix, the second file can be
39 recovered by piping the output through
41 sed -n '/^[+ ] /s/^..//p'
43 See module comments for details and programmatic interface.
44 """
46 __version__ = 1, 5, 0
48 # SequenceMatcher tries to compute a "human-friendly diff" between
49 # two sequences (chiefly picturing a file as a sequence of lines,
50 # and a line as a sequence of characters, here). Unlike e.g. UNIX(tm)
51 # diff, the fundamental notion is the longest *contiguous* & junk-free
52 # matching subsequence. That's what catches peoples' eyes. The
53 # Windows(tm) windiff has another interesting notion, pairing up elements
54 # that appear uniquely in each sequence. That, and the method here,
55 # appear to yield more intuitive difference reports than does diff. This
56 # method appears to be the least vulnerable to synching up on blocks
57 # of "junk lines", though (like blank lines in ordinary text files,
58 # or maybe "<P>" lines in HTML files). That may be because this is
59 # the only method of the 3 that has a *concept* of "junk" <wink>.
61 # Note that ndiff makes no claim to produce a *minimal* diff. To the
62 # contrary, minimal diffs are often counter-intuitive, because they
63 # synch up anywhere possible, sometimes accidental matches 100 pages
64 # apart. Restricting synch points to contiguous matches preserves some
65 # notion of locality, at the occasional cost of producing a longer diff.
67 # With respect to junk, an earlier version of ndiff simply refused to
68 # *start* a match with a junk element. The result was cases like this:
69 # before: private Thread currentThread;
70 # after: private volatile Thread currentThread;
71 # If you consider whitespace to be junk, the longest contiguous match
72 # not starting with junk is "e Thread currentThread". So ndiff reported
73 # that "e volatil" was inserted between the 't' and the 'e' in "private".
74 # While an accurate view, to people that's absurd. The current version
75 # looks for matching blocks that are entirely junk-free, then extends the
76 # longest one of those as far as possible but only with matching junk.
77 # So now "currentThread" is matched, then extended to suck up the
78 # preceding blank; then "private" is matched, and extended to suck up the
79 # following blank; then "Thread" is matched; and finally ndiff reports
80 # that "volatile " was inserted before "Thread". The only quibble
81 # remaining is that perhaps it was really the case that " volatile"
82 # was inserted after "private". I can live with that <wink>.
84 # NOTE on junk: the module-level names
85 # IS_LINE_JUNK
86 # IS_CHARACTER_JUNK
87 # can be set to any functions you like. The first one should accept
88 # a single string argument, and return true iff the string is junk.
89 # The default is whether the regexp r"\s*#?\s*$" matches (i.e., a
90 # line without visible characters, except for at most one splat).
91 # The second should accept a string of length 1 etc. The default is
92 # whether the character is a blank or tab (note: bad idea to include
93 # newline in this!).
95 # After setting those, you can call fcompare(f1name, f2name) with the
96 # names of the files you want to compare. The difference report
97 # is sent to stdout. Or you can call main(args), passing what would
98 # have been in sys.argv[1:] had the cmd-line form been used.
100 from difflib import SequenceMatcher
102 import string
TRACE = 0   # set nonzero to make fancy_replace print its internal debugging dumps
105 # define what "junk" means
106 import re
def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    """Return true iff line is blank except for at most one '#'.

    Used as the line-level junk filter for the file comparison; may be
    rebound by callers to any predicate taking a single string.
    """
    return bool(pat(line))
def IS_CHARACTER_JUNK(ch, ws=" \t"):
    """Return true iff ch is a blank or a tab.

    Character-level junk filter for intraline comparison; deliberately
    excludes newline (see the module comments).
    """
    return ws.find(ch) != -1

del re
116 # meant for dumping lines
117 def dump(tag, x, lo, hi):
118 for i in xrange(lo, hi):
119 print tag, x[i],
def plain_replace(a, alo, ahi, b, blo, bhi):
    """Dump a straight block replacement: all '-' lines, all '+' lines.

    The shorter block is emitted first -- easier on the reader's
    short-term memory when the two blocks differ a lot in size.
    """
    assert alo < ahi and blo < bhi
    minus = ('-', a, alo, ahi)
    plus = ('+', b, blo, bhi)
    if bhi - blo < ahi - alo:
        first, second = plus, minus
    else:
        first, second = minus, plus
    dump(*first)
    dump(*second)
# When replacing one block of lines with another, this guy searches
# the blocks for *similar* lines; the best-matching pair (if any) is
# used as a synch point, and intraline difference marking is done on
# the similar pair. Lots of work, but often worth it.

def fancy_replace(a, alo, ahi, b, blo, bhi):
    """Dump a[alo:ahi] replaced by b[blo:bhi], with intraline markup.

    Finds the most similar non-identical line pair across the two
    blocks (similarity >= 0.75 as scored by SequenceMatcher.ratio),
    uses it as a synch point, prints a '-', '?', '+', '?' quad marking
    the intraline differences, and recurses via fancy_helper on the
    line ranges before and after the synch point.  Falls back to an
    identical (junk) pair if no close pair exists, and to
    plain_replace if there is no usable pair at all.
    """
    if TRACE:
        print '*** fancy_replace', alo, ahi, blo, bhi
        dump('>', a, alo, ahi)
        dump('<', b, blo, bhi)

    # don't synch up unless the lines have a similarity score of at
    # least cutoff; best_ratio tracks the best score seen so far
    best_ratio, cutoff = 0.74, 0.75
    cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
    eqi, eqj = None, None # 1st indices of equal lines (if any)

    # search for the pair that matches best without being identical
    # (identical lines must be junk lines, & we don't want to synch up
    # on junk -- unless we have to)
    for j in xrange(blo, bhi):
        bj = b[j]
        cruncher.set_seq2(bj)
        for i in xrange(alo, ahi):
            ai = a[i]
            if ai == bj:
                # remember only the first identical pair; skip scoring
                if eqi is None:
                    eqi, eqj = i, j
                continue
            cruncher.set_seq1(ai)
            # computing similarity is expensive, so use the quick
            # upper bounds first -- have seen this speed up messy
            # compares by a factor of 3.
            # note that ratio() is only expensive to compute the first
            # time it's called on a sequence pair; the expensive part
            # of the computation is cached by cruncher
            if cruncher.real_quick_ratio() > best_ratio and \
               cruncher.quick_ratio() > best_ratio and \
               cruncher.ratio() > best_ratio:
                best_ratio, best_i, best_j = cruncher.ratio(), i, j
    if best_ratio < cutoff:
        # no non-identical "pretty close" pair
        if eqi is None:
            # no identical pair either -- treat it as a straight replace
            plain_replace(a, alo, ahi, b, blo, bhi)
            return
        # no close pair, but an identical pair -- synch up on that
        best_i, best_j, best_ratio = eqi, eqj, 1.0
    else:
        # there's a close pair, so forget the identical pair (if any)
        eqi = None

    # a[best_i] very similar to b[best_j]; eqi is None iff they're not
    # identical
    if TRACE:
        print '*** best_ratio', best_ratio, best_i, best_j
        dump('>', a, best_i, best_i+1)
        dump('<', b, best_j, best_j+1)

    # pump out diffs from before the synch point
    fancy_helper(a, alo, best_i, b, blo, best_j)

    # do intraline marking on the synch pair
    aelt, belt = a[best_i], b[best_j]
    if eqi is None:
        # pump out a '-', '?', '+', '?' quad for the synched lines
        atags = btags = ""
        cruncher.set_seqs(aelt, belt)
        for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
            la, lb = ai2 - ai1, bj2 - bj1
            if tag == 'replace':
                atags += '^' * la
                btags += '^' * lb
            elif tag == 'delete':
                atags += '-' * la
            elif tag == 'insert':
                btags += '+' * lb
            elif tag == 'equal':
                atags += ' ' * la
                btags += ' ' * lb
            else:
                raise ValueError, 'unknown tag ' + `tag`
        printq(aelt, belt, atags, btags)
    else:
        # the synch pair is identical
        print ' ', aelt,

    # pump out diffs from after the synch point
    fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
def fancy_helper(a, alo, ahi, b, blo, bhi):
    """Dispatch a pair of sub-ranges left over around a synch point.

    Both non-empty -> fancy_replace; only one non-empty -> dump it as
    pure deletions ('-') or insertions ('+'); both empty -> nothing.
    """
    have_a = alo < ahi
    have_b = blo < bhi
    if have_a and have_b:
        fancy_replace(a, alo, ahi, b, blo, bhi)
    elif have_a:
        dump('-', a, alo, ahi)
    elif have_b:
        dump('+', b, blo, bhi)
231 # Crap to deal with leading tabs in "?" output. Can hurt, but will
232 # probably help most of the time.
234 def printq(aline, bline, atags, btags):
235 common = min(count_leading(aline, "\t"),
236 count_leading(bline, "\t"))
237 common = min(common, count_leading(atags[:common], " "))
238 print "-", aline,
239 if count_leading(atags, " ") < len(atags):
240 print "?", "\t" * common + atags[common:]
241 print "+", bline,
242 if count_leading(btags, " ") < len(btags):
243 print "?", "\t" * common + btags[common:]
def count_leading(line, ch):
    """Return the number of consecutive copies of ch at the start of line."""
    count = 0
    for c in line:
        if c != ch:
            break
        count += 1
    return count
def fail(msg):
    """Write msg plus the module usage docstring to stderr; return 0.

    Returning 0 (false) lets callers do "return fail(...)" to both
    gripe and report failure in one step.
    """
    import sys
    sys.stderr.write(msg + "\n\n")
    sys.stderr.write(__doc__)
    return 0
258 # open a file & return the file object; gripe and return 0 if it
259 # couldn't be opened
260 def fopen(fname):
261 try:
262 return open(fname, 'r')
263 except IOError, detail:
264 return fail("couldn't open " + fname + ": " + str(detail))
266 # open two files & spray the diff to stdout; return false iff a problem
267 def fcompare(f1name, f2name):
268 f1 = fopen(f1name)
269 f2 = fopen(f2name)
270 if not f1 or not f2:
271 return 0
273 a = f1.readlines(); f1.close()
274 b = f2.readlines(); f2.close()
276 cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
277 for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
278 if tag == 'replace':
279 fancy_replace(a, alo, ahi, b, blo, bhi)
280 elif tag == 'delete':
281 dump('-', a, alo, ahi)
282 elif tag == 'insert':
283 dump('+', b, blo, bhi)
284 elif tag == 'equal':
285 dump(' ', a, alo, ahi)
286 else:
287 raise ValueError, 'unknown tag ' + `tag`
289 return 1
# crack args (sys.argv[1:] is normal) & compare;
# return false iff a problem

def main(args):
    """Crack command-line args and run the requested ndiff action.

    args is what sys.argv[1:] would be in the command-line form.
    Returns true on success; on any problem, gripes to stderr via
    fail() and returns false (0).
    """
    import getopt
    try:
        opts, args = getopt.getopt(args, "qr:")
    except getopt.error, detail:
        return fail(str(detail))
    noisy = 1
    qseen = rseen = 0
    for opt, val in opts:
        if opt == "-q":
            qseen = 1
            noisy = 0
        elif opt == "-r":
            rseen = 1
            whichfile = val     # expected to be "1" or "2"
    # -q and -r are mutually exclusive modes
    if qseen and rseen:
        return fail("can't specify both -q and -r")
    if rseen:
        # restore mode: rebuild file 1 or 2 from an ndiff report on stdin
        if args:
            return fail("no args allowed with -r option")
        if whichfile in "12":
            restore(whichfile)
            return 1
        return fail("-r value must be 1 or 2")
    # diff mode: exactly two filename args required
    if len(args) != 2:
        return fail("need 2 filename args")
    f1name, f2name = args
    if noisy:
        # the two header lines documented in the module docstring
        print '-:', f1name
        print '+:', f2name
    return fcompare(f1name, f2name)
326 def restore(which):
327 import sys
328 tag = {"1": "- ", "2": "+ "}[which]
329 prefixes = (" ", tag)
330 for line in sys.stdin.readlines():
331 if line[:2] in prefixes:
332 print line[2:],
if __name__ == '__main__':
    import sys
    args = sys.argv[1:]
    if "-profile" not in args:
        main(args)
    else:
        # hidden option: profile the run and print a time-sorted report
        import profile, pstats
        args.remove("-profile")
        statf = "ndiff.pro"
        profile.run("main(args)", statf)
        pstats.Stats(statf).strip_dirs().sort_stats('time').print_stats()