# Module ndiff version 1.6.0
# Released to the public domain 08-Dec-2000,
# by Tim Peters (tim.one@home.com).

# Provided as-is; use at your own risk; no warranty; no promises; enjoy!

"""ndiff [-q] file1 file2
    or
ndiff (-r1 | -r2) < ndiff_output > file1_or_file2

Print a human-friendly file difference report to stdout. Both inter-
and intra-line differences are noted. In the second form, recreate file1
(-r1) or file2 (-r2) on stdout, from an ndiff report on stdin.

In the first form, if -q ("quiet") is not specified, the first two lines
of output are

-: file1
+: file2

Each remaining line begins with a two-letter code:

    "- "    line unique to file1
    "+ "    line unique to file2
    "  "    line common to both files
    "? "    line not present in either input file

Lines beginning with "? " attempt to guide the eye to intraline
differences, and were not present in either input file. These lines can be
confusing if the source files contain tab characters.

The first file can be recovered by retaining only lines that begin with
"  " or "- ", and deleting those 2-character prefixes; use ndiff with -r1.

The second file can be recovered similarly, but by retaining only "  " and
"+ " lines; use ndiff with -r2; or, on Unix, the second file can be
recovered by piping the output through

    sed -n '/^[+ ] /s/^..//p'

See module comments for details and programmatic interface.
"""

__version__ = 1, 6, 0

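# Illustrative sketch (not part of the original module): the recovery rule
# described in the docstring above -- keep only the "  " and "+ " lines of
# a report and strip the 2-character prefixes -- rebuilds file2.  The tiny
# report below is made up.
def _demo_recover_file2():
    report = ["  common line\n",
              "- only in file1\n",
              "+ only in file2\n",
              "? ++++\n"]
    for line in report:
        if line[:2] in ("  ", "+ "):
            print line[2:],
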
# SequenceMatcher tries to compute a "human-friendly diff" between
# two sequences (chiefly picturing a file as a sequence of lines,
# and a line as a sequence of characters, here). Unlike e.g. UNIX(tm)
# diff, the fundamental notion is the longest *contiguous* & junk-free
# matching subsequence. That's what catches peoples' eyes. The
# Windows(tm) windiff has another interesting notion, pairing up elements
# that appear uniquely in each sequence. That, and the method here,
# appear to yield more intuitive difference reports than does diff. This
# method appears to be the least vulnerable to synching up on blocks
# of "junk lines", though (like blank lines in ordinary text files,
# or maybe "<P>" lines in HTML files). That may be because this is
# the only method of the 3 that has a *concept* of "junk" <wink>.

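# Illustrative sketch (not part of the original module): the matching
# blocks SequenceMatcher reports are always contiguous runs, never
# scattered single-element matches.  The trailing (len(a), len(b), 0)
# entry is a sentinel.
def _demo_matching_blocks():
    from difflib import SequenceMatcher
    a, b = "private Thread", "private volatile Thread"
    for i, j, n in SequenceMatcher(None, a, b).get_matching_blocks():
        # each (i, j, n) triple means a[i:i+n] == b[j:j+n]
        print (i, j, n), repr(a[i:i+n])
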
# Note that ndiff makes no claim to produce a *minimal* diff. To the
# contrary, minimal diffs are often counter-intuitive, because they
# synch up anywhere possible, sometimes accidental matches 100 pages
# apart. Restricting synch points to contiguous matches preserves some
# notion of locality, at the occasional cost of producing a longer diff.
#
# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element. The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread". So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd. The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread". The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private". I can live with that <wink>.

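# A quick check of the behavior described above (illustrative only, not
# part of the original module): with blanks and tabs treated as junk,
# the opcodes should show a single insert of "volatile " rather than
# the old "e volatil" absurdity.
def _demo_volatile_example():
    from difflib import SequenceMatcher
    before = "private Thread currentThread;"
    after = "private volatile Thread currentThread;"
    cruncher = SequenceMatcher(lambda ch: ch in " \t", before, after)
    for tag, i1, i2, j1, j2 in cruncher.get_opcodes():
        print tag, repr(before[i1:i2]), repr(after[j1:j2])
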
# NOTE on junk: the module-level names
#    IS_LINE_JUNK
#    IS_CHARACTER_JUNK
# can be set to any functions you like. The first one should accept
# a single string argument, and return true iff the string is junk.
# The default is whether the regexp r"\s*#?\s*$" matches (i.e., a
# line without visible characters, except for at most one splat).
# The second should accept a string of length 1 etc. The default is
# whether the character is a blank or tab (note: bad idea to include
# newline in this!).
#
# After setting those, you can call fcompare(f1name, f2name) with the
# names of the files you want to compare. The difference report
# is sent to stdout. Or you can call main(args), passing what would
# have been in sys.argv[1:] had the cmd-line form been used.

from difflib import SequenceMatcher

import sys

TRACE = 0

# define what "junk" means

import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    return ch in ws

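# Illustrative sketch of the programmatic interface described in the module
# comments (not part of the original module; the file names and the extra
# junk rule are made up): rebind the junk hook, then ask for a report.
def _demo_programmatic_use():
    global IS_LINE_JUNK
    default_junk = IS_LINE_JUNK
    # also treat lines starting with "//" as junk
    IS_LINE_JUNK = lambda line, dj=default_junk: dj(line) or line[:2] == "//"
    return fcompare("old_version.py", "new_version.py")
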
# meant for dumping lines
def dump(tag, x, lo, hi):
    for i in xrange(lo, hi):
        print tag, x[i],

def plain_replace(a, alo, ahi, b, blo, bhi):
    assert alo < ahi and blo < bhi
    # dump the shorter block first -- reduces the burden on short-term
    # memory if the blocks are of very different sizes
    if bhi - blo < ahi - alo:
        dump('+', b, blo, bhi)
        dump('-', a, alo, ahi)
    else:
        dump('-', a, alo, ahi)
        dump('+', b, blo, bhi)

# When replacing one block of lines with another, this guy searches
# the blocks for *similar* lines; the best-matching pair (if any) is
# used as a synch point, and intraline difference marking is done on
# the similar pair. Lots of work, but often worth it.

def fancy_replace(a, alo, ahi, b, blo, bhi):
    if TRACE:
        print '*** fancy_replace', alo, ahi, blo, bhi
        dump('>', a, alo, ahi)
        dump('<', b, blo, bhi)

    # don't synch up unless the lines have a similarity score of at
    # least cutoff; best_ratio tracks the best score seen so far
    best_ratio, cutoff = 0.74, 0.75
    cruncher = SequenceMatcher(IS_CHARACTER_JUNK)
    eqi, eqj = None, None  # 1st indices of equal lines (if any)

    # search for the pair that matches best without being identical
    # (identical lines must be junk lines, & we don't want to synch up
    # on junk -- unless we have to)
    for j in xrange(blo, bhi):
        bj = b[j]
        cruncher.set_seq2(bj)
        for i in xrange(alo, ahi):
            ai = a[i]
            if ai == bj:
                if eqi is None:
                    eqi, eqj = i, j
                continue
            cruncher.set_seq1(ai)
            # computing similarity is expensive, so use the quick
            # upper bounds first -- have seen this speed up messy
            # compares by a factor of 3.
            # note that ratio() is only expensive to compute the first
            # time it's called on a sequence pair; the expensive part
            # of the computation is cached by cruncher
            if cruncher.real_quick_ratio() > best_ratio and \
                  cruncher.quick_ratio() > best_ratio and \
                  cruncher.ratio() > best_ratio:
                best_ratio, best_i, best_j = cruncher.ratio(), i, j
    if best_ratio < cutoff:
        # no non-identical "pretty close" pair
        if eqi is None:
            # no identical pair either -- treat it as a straight replace
            plain_replace(a, alo, ahi, b, blo, bhi)
            return
        # no close pair, but an identical pair -- synch up on that
        best_i, best_j, best_ratio = eqi, eqj, 1.0
    else:
        # there's a close pair, so forget the identical pair (if any)
        eqi = None

    # a[best_i] very similar to b[best_j]; eqi is None iff they're not
    # identical
    if TRACE:
        print '*** best_ratio', best_ratio, best_i, best_j
        dump('>', a, best_i, best_i+1)
        dump('<', b, best_j, best_j+1)

    # pump out diffs from before the synch point
    fancy_helper(a, alo, best_i, b, blo, best_j)

    # do intraline marking on the synch pair
    aelt, belt = a[best_i], b[best_j]
    if eqi is None:
        # pump out a '-', '?', '+', '?' quad for the synched lines
        atags = btags = ""
        cruncher.set_seqs(aelt, belt)
        for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
            la, lb = ai2 - ai1, bj2 - bj1
            if tag == 'replace':
                atags += '^' * la
                btags += '^' * lb
            elif tag == 'delete':
                atags += '-' * la
            elif tag == 'insert':
                btags += '+' * lb
            elif tag == 'equal':
                atags += ' ' * la
                btags += ' ' * lb
            else:
                raise ValueError, 'unknown tag ' + `tag`
        printq(aelt, belt, atags, btags)
    else:
        # the synch pair is identical
        print ' ', aelt,

    # pump out diffs from after the synch point
    fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

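# Illustrative sketch (not part of the original module): calling
# fancy_replace on two similar one-line blocks prints the '-', '?',
# '+', '?' quad described above, with '^', '-' and '+' marking the
# intraline differences.
def _demo_fancy_replace():
    fancy_replace(["abcDefghiJkl\n"], 0, 1, ["abcdefGhijkl\n"], 0, 1)
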
def fancy_helper(a, alo, ahi, b, blo, bhi):
    if alo < ahi:
        if blo < bhi:
            fancy_replace(a, alo, ahi, b, blo, bhi)
        else:
            dump('-', a, alo, ahi)
    elif blo < bhi:
        dump('+', b, blo, bhi)

# Crap to deal with leading tabs in "?" output. Can hurt, but will
# probably help most of the time.

def printq(aline, bline, atags, btags):
    common = min(count_leading(aline, "\t"),
                 count_leading(bline, "\t"))
    common = min(common, count_leading(atags[:common], " "))
    print "-", aline,
    if count_leading(atags, " ") < len(atags):
        print "?", "\t" * common + atags[common:]
    print "+", bline,
    if count_leading(btags, " ") < len(btags):
        print "?", "\t" * common + btags[common:]

def count_leading(line, ch):
    i, n = 0, len(line)
    while i < n and line[i] == ch:
        i = i + 1
    return i

def fail(msg):
    out = sys.stderr.write
    out(msg + "\n\n")
    out(__doc__)
    return 0

# open a file & return the file object; gripe and return 0 if it
# couldn't be opened
def fopen(fname):
    try:
        return open(fname, 'r')
    except IOError, detail:
        return fail("couldn't open " + fname + ": " + str(detail))

# open two files & spray the diff to stdout; return false iff a problem
def fcompare(f1name, f2name):
    f1 = fopen(f1name)
    f2 = fopen(f2name)
    if not f1 or not f2:
        return 0

    a = f1.readlines(); f1.close()
    b = f2.readlines(); f2.close()

    cruncher = SequenceMatcher(IS_LINE_JUNK, a, b)
    for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
        if tag == 'replace':
            fancy_replace(a, alo, ahi, b, blo, bhi)
        elif tag == 'delete':
            dump('-', a, alo, ahi)
        elif tag == 'insert':
            dump('+', b, blo, bhi)
        elif tag == 'equal':
            dump(' ', a, alo, ahi)
        else:
            raise ValueError, 'unknown tag ' + `tag`

    return 1

# crack args (sys.argv[1:] is normal) & compare;
# return false iff a problem
def main(args):
    import getopt
    try:
        opts, args = getopt.getopt(args, "qr:")
    except getopt.error, detail:
        return fail(str(detail))
    noisy = 1
    qseen = rseen = 0
    for opt, val in opts:
        if opt == "-q":
            qseen, noisy = 1, 0
        elif opt == "-r":
            rseen, whichfile = 1, val
    if qseen and rseen:
        return fail("can't specify both -q and -r")
    if rseen:
        if args:
            return fail("no args allowed with -r option")
        if whichfile in "12":
            restore(whichfile)
            return 1
        return fail("-r value must be 1 or 2")
    if len(args) != 2:
        return fail("need 2 filename args")
    f1name, f2name = args
    if noisy:
        print '-:', f1name
        print '+:', f2name
    return fcompare(f1name, f2name)

def restore(which):
    tag = {"1": "- ", "2": "+ "}[which]
    prefixes = ("  ", tag)
    for line in sys.stdin.readlines():
        if line[:2] in prefixes:
            print line[2:],

if __name__ == '__main__':
    args = sys.argv[1:]
    if "-profile" in args:
        import profile, pstats
        args.remove("-profile")
        statf = "ndiff.pro"   # file to hold the profiler stats
        profile.run("main(args)", statf)
        stats = pstats.Stats(statf)
        stats.strip_dirs().sort_stats('time').print_stats()
    else:
        main(args)