Doc/lib/libdifflib.tex

   1 \section{\module{difflib} ---
   2          Helpers for computing deltas}
   3
   4 \declaremodule{standard}{difflib}
   5 \modulesynopsis{Helpers for computing differences between objects.}
   6 \moduleauthor{Tim Peters}{tim_one@users.sourceforge.net}
   7 \sectionauthor{Tim Peters}{tim_one@users.sourceforge.net}
   8 % LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
   9
  10 \versionadded{2.1}
  11
  12
  13 \begin{classdesc*}{SequenceMatcher}
  14   This is a flexible class for comparing pairs of sequences of any
  15   type, so long as the sequence elements are hashable.  The basic
  16   algorithm predates, and is a little fancier than, an algorithm
  17   published in the late 1980's by Ratcliff and Obershelp under the
  18   hyperbolic name ``gestalt pattern matching.''  The idea is to find
  19   the longest contiguous matching subsequence that contains no
  20   ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
  21   address junk).  The same idea is then applied recursively to the
  22   pieces of the sequences to the left and to the right of the matching
  23   subsequence.  This does not yield minimal edit sequences, but does
  24   tend to yield matches that ``look right'' to people.
  25
  26   \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
  27   time in the worst case and quadratic time in the expected case.
  28   \class{SequenceMatcher} is quadratic time for the worst case and has
  29   expected-case behavior dependent in a complicated way on how many
  30   elements the sequences have in common; best case time is linear.
  31 \end{classdesc*}
  32
  33 \begin{classdesc*}{Differ}
  34   This is a class for comparing sequences of lines of text, and
  35   producing human-readable differences or deltas.  \class{Differ} uses
  36   \class{SequenceMatcher} both to compare sequences of lines, and to
  37   compare sequences of characters within similar (near-matching)
  38   lines.
  39
  40   Each line of a \class{Differ} delta begins with a two-letter code:
  41
  42 \begin{tableii}{l|l}{code}{Code}{Meaning}
  43   \lineii{'- '}{line unique to sequence 1}
  44   \lineii{'+ '}{line unique to sequence 2}
  45   \lineii{'  '}{line common to both sequences}
  46   \lineii{'? '}{line not present in either input sequence}
  47 \end{tableii}
  48
  49   Lines beginning with `\code{?~}' attempt to guide the eye to
  50   intraline differences, and were not present in either input
  51   sequence. These lines can be confusing if the sequences contain tab
  52   characters.
  53 \end{classdesc*}
  54
  55 \begin{classdesc*}{HtmlDiff}
  56
  57   This class can be used to create an HTML table (or a complete HTML file
  58   containing the table) showing a side by side, line by line comparison
  59   of text with inter-line and intra-line change highlights.  The table can
  60   be generated in either full or contextual difference mode.
  61
  62   The constructor for this class is:
  63
  64   \begin{funcdesc}{__init__}{\optional{tabsize}\optional{,
  65     wrapcolumn}\optional{, linejunk}\optional{, charjunk}}
  66
  67     Initializes instance of \class{HtmlDiff}.
  68
  69     \var{tabsize} is an optional keyword argument to specify tab stop spacing
  70     and defaults to \code{8}.
  71
  72     \var{wrapcolumn} is an optional keyword to specify column number where
  73     lines are broken and wrapped, defaults to \code{None} where lines are not
  74     wrapped.
  75
  76     \var{linejunk} and \var{charjunk} are optional keyword arguments passed
  77     into \function{ndiff()} (used by \class{HtmlDiff} to generate the
  78     side by side HTML differences).  See \function{ndiff()} documentation for
  79     argument default values and descriptions.
  80
  81   \end{funcdesc}
  82
  83   The following methods are public:
  84
  85   \begin{funcdesc}{make_file}{fromlines, tolines
  86     \optional{, fromdesc}\optional{, todesc}\optional{, context}\optional{,
  87     numlines}}
  88     Compares \var{fromlines} and \var{tolines} (lists of strings) and returns
  89     a string which is a complete HTML file containing a table showing line by
  90     line differences with inter-line and intra-line changes highlighted.
  91
  92     \var{fromdesc} and \var{todesc} are optional keyword arguments to specify
  93     from/to file column header strings (both default to an empty string).
  94
  95     \var{context} and \var{numlines} are both optional keyword arguments.
  96     Set \var{context} to \code{True} when contextual differences are to be
  97     shown, else the default is \code{False} to show the full files.
  98     \var{numlines} defaults to \code{5}.  When \var{context} is \code{True}
  99     \var{numlines} controls the number of context lines which surround the
 100     difference highlights.  When \var{context} is \code{False} \var{numlines}
 101     controls the number of lines which are shown before a difference
 102     highlight when using the ``next'' hyperlinks (setting to zero would cause
 103     the ``next'' hyperlinks to place the next difference highlight at the top of
 104     the browser without any leading context).
 105   \end{funcdesc}
 106
 107   \begin{funcdesc}{make_table}{fromlines, tolines
 108     \optional{, fromdesc}\optional{, todesc}\optional{, context}\optional{,
 109     numlines}}
 110     Compares \var{fromlines} and \var{tolines} (lists of strings) and returns
 111     a string which is a complete HTML table showing line by line differences
 112     with inter-line and intra-line changes highlighted.
 113
 114     The arguments for this method are the same as those for the
 115     \method{make_file()} method.
 116   \end{funcdesc}
 117
 118   \file{Tools/scripts/diff.py} is a command-line front-end to this class
 119   and contains a good example of its use.
 120
 121   \versionadded{2.4}
 122 \end{classdesc*}
 123
 124 \begin{funcdesc}{context_diff}{a, b\optional{, fromfile}\optional{,
 125     tofile}\optional{, fromfiledate}\optional{, tofiledate}\optional{,
 126     n}\optional{, lineterm}}
 127   Compare \var{a} and \var{b} (lists of strings); return a
 128   delta (a generator generating the delta lines) in context diff
 129   format.
 130
 131   Context diffs are a compact way of showing just the lines that have
 132   changed plus a few lines of context.  The changes are shown in a
 133   before/after style.  The number of context lines is set by \var{n}
 134   which defaults to three.
 135
 136   By default, the diff control lines (those with \code{***} or \code{---})
 137   are created with a trailing newline.  This is helpful so that inputs created
 138   from \function{file.readlines()} result in diffs that are suitable for use
 139   with \function{file.writelines()} since both the inputs and outputs have
 140   trailing newlines.
 141
 142   For inputs that do not have trailing newlines, set the \var{lineterm}
 143   argument to \code{""} so that the output will be uniformly newline free.
 144
 145   The context diff format normally has a header for filenames and
 146   modification times.  Any or all of these may be specified using strings for
 147   \var{fromfile}, \var{tofile}, \var{fromfiledate}, and \var{tofiledate}.
 148   The modification times are normally expressed in the format returned by
 149   \function{time.ctime()}.  If not specified, the strings default to blanks.
 150
 151   \file{Tools/scripts/diff.py} is a command-line front-end for this
 152   function.
 153
 154   \versionadded{2.3}
 155 \end{funcdesc}
 156
 157 \begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
 158                  n}\optional{, cutoff}}
 159   Return a list of the best ``good enough'' matches.  \var{word} is a
 160   sequence for which close matches are desired (typically a string),
 161   and \var{possibilities} is a list of sequences against which to
 162   match \var{word} (typically a list of strings).
 163
 164   Optional argument \var{n} (default \code{3}) is the maximum number
 165   of close matches to return; \var{n} must be greater than \code{0}.
 166
 167   Optional argument \var{cutoff} (default \code{0.6}) is a float in
 168   the range [0, 1].  Possibilities that don't score at least that
 169   similar to \var{word} are ignored.
 170
 171   The best (no more than \var{n}) matches among the possibilities are
 172   returned in a list, sorted by similarity score, most similar first.
 173
 174 \begin{verbatim}
 175 >>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
 176 ['apple', 'ape']
 177 >>> import keyword
 178 >>> get_close_matches('wheel', keyword.kwlist)
 179 ['while']
 180 >>> get_close_matches('apple', keyword.kwlist)
 181 []
 182 >>> get_close_matches('accept', keyword.kwlist)
 183 ['except']
 184 \end{verbatim}
 185 \end{funcdesc}
 186
 187 \begin{funcdesc}{ndiff}{a, b\optional{, linejunk}\optional{, charjunk}}
 188   Compare \var{a} and \var{b} (lists of strings); return a
 189   \class{Differ}-style delta (a generator generating the delta lines).
 190
 191   Optional keyword parameters \var{linejunk} and \var{charjunk} are
 192   for filter functions (or \code{None}):
 193
 194   \var{linejunk}: A function that accepts a single string
 195   argument, and returns true if the string is junk, or false if not.
 196   The default is \code{None}, starting with Python 2.3.  Before then,
 197   the default was the module-level function
 198   \function{IS_LINE_JUNK()}, which filters out lines without visible
 199   characters, except for at most one pound character (\character{\#}).
 200   As of Python 2.3, the underlying \class{SequenceMatcher} class
 201   does a dynamic analysis of which lines are so frequent as to
 202   constitute noise, and this usually works better than the pre-2.3
 203   default.
 204
 205   \var{charjunk}: A function that accepts a character (a string of
 206   length 1), and returns true if the character is junk, or false if not.
 207   The default is module-level function \function{IS_CHARACTER_JUNK()},
 208   which filters out whitespace characters (a blank or tab; note: bad
 209   idea to include newline in this!).
 210
 211   \file{Tools/scripts/ndiff.py} is a command-line front-end to this
 212   function.
 213
 214 \begin{verbatim}
 215 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
 216 ...              'ore\ntree\nemu\n'.splitlines(1))
 217 >>> print ''.join(diff),
 218 - one
 219 ?  ^
 220 + ore
 221 ?  ^
 222 - two
 223 - three
 224 ?  -
 225 + tree
 226 + emu
 227 \end{verbatim}
 228 \end{funcdesc}
 229
 230 \begin{funcdesc}{restore}{sequence, which}
 231   Return one of the two sequences that generated a delta.
 232
 233   Given a \var{sequence} produced by \method{Differ.compare()} or
 234   \function{ndiff()}, extract lines originating from file 1 or 2
 235   (parameter \var{which}), stripping off line prefixes.
 236
 237   Example:
 238
 239 \begin{verbatim}
 240 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
 241 ...              'ore\ntree\nemu\n'.splitlines(1))
 242 >>> diff = list(diff) # materialize the generated delta into a list
 243 >>> print ''.join(restore(diff, 1)),
 244 one
 245 two
 246 three
 247 >>> print ''.join(restore(diff, 2)),
 248 ore
 249 tree
 250 emu
 251 \end{verbatim}
 252
 253 \end{funcdesc}
 254
 255 \begin{funcdesc}{unified_diff}{a, b\optional{, fromfile}\optional{,
 256     tofile}\optional{, fromfiledate}\optional{, tofiledate}\optional{,
 257     n}\optional{, lineterm}}
 258   Compare \var{a} and \var{b} (lists of strings); return a
 259   delta (a generator generating the delta lines) in unified diff
 260   format.
 261
 262   Unified diffs are a compact way of showing just the lines that have
 263   changed plus a few lines of context.  The changes are shown in an
 264   inline style (instead of separate before/after blocks).  The number
 265   of context lines is set by \var{n} which defaults to three.
 266
 267   By default, the diff control lines (those with \code{---}, \code{+++},
 268   or \code{@@}) are created with a trailing newline.  This is helpful so
 269   that inputs created from \function{file.readlines()} result in diffs
 270   that are suitable for use with \function{file.writelines()} since both
 271   the inputs and outputs have trailing newlines.
 272
 273   For inputs that do not have trailing newlines, set the \var{lineterm}
 274   argument to \code{""} so that the output will be uniformly newline free.
 275
 276   The unified diff format normally has a header for filenames and
 277   modification times.  Any or all of these may be specified using strings for
 278   \var{fromfile}, \var{tofile}, \var{fromfiledate}, and \var{tofiledate}.
 279   The modification times are normally expressed in the format returned by
 280   \function{time.ctime()}.  If not specified, the strings default to blanks.
 281
 282   \file{Tools/scripts/diff.py} is a command-line front-end for this
 283   function.
 284
 285   \versionadded{2.3}
 286 \end{funcdesc}
 287
 288 \begin{funcdesc}{IS_LINE_JUNK}{line}
 289   Return true for ignorable lines.  The line \var{line} is ignorable
 290   if \var{line} is blank or contains a single \character{\#},
 291   otherwise it is not ignorable.  Used as a default for parameter
 292   \var{linejunk} in \function{ndiff()} before Python 2.3.
 293 \end{funcdesc}
 294
 295
 296 \begin{funcdesc}{IS_CHARACTER_JUNK}{ch}
 297   Return true for ignorable characters.  The character \var{ch} is
 298   ignorable if \var{ch} is a space or tab, otherwise it is not
 299   ignorable.  Used as a default for parameter \var{charjunk} in
 300   \function{ndiff()}.
 301 \end{funcdesc}
 302
 303
 304 \begin{seealso}
 305   \seetitle[http://www.ddj.com/documents/s=1103/ddj8807c/]
 306            {Pattern Matching: The Gestalt Approach}{Discussion of a
 307             similar algorithm by John W. Ratcliff and D. E. Metzener.
 308             This was published in
 309             \citetitle[http://www.ddj.com/]{Dr. Dobb's Journal} in
 310             July, 1988.}
 311 \end{seealso}
 312
 313
 314 \subsection{SequenceMatcher Objects \label{sequence-matcher}}
 315
 316 The \class{SequenceMatcher} class has this constructor:
 317
 318 \begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
 319                                    a\optional{, b}}}}
 320   Optional argument \var{isjunk} must be \code{None} (the default) or
 321   a one-argument function that takes a sequence element and returns
 322   true if and only if the element is ``junk'' and should be ignored.
 323   Passing \code{None} for \var{isjunk} is equivalent to passing
 324   \code{lambda x: 0}; in other words, no elements are ignored.  For
 325   example, pass:
 326
 327 \begin{verbatim}
 328 lambda x: x in " \t"
 329 \end{verbatim}
 330
 331   if you're comparing lines as sequences of characters, and don't want
 332   to synch up on blanks or hard tabs.
 333
 334   The optional arguments \var{a} and \var{b} are sequences to be
 335   compared; both default to empty strings.  The elements of both
 336   sequences must be hashable.
 337 \end{classdesc}
 338
 339
 340 \class{SequenceMatcher} objects have the following methods:
 341
 342 \begin{methoddesc}{set_seqs}{a, b}
 343   Set the two sequences to be compared.
 344 \end{methoddesc}
 345
 346 \class{SequenceMatcher} computes and caches detailed information about
 347 the second sequence, so if you want to compare one sequence against
 348 many sequences, use \method{set_seq2()} to set the commonly used
 349 sequence once and call \method{set_seq1()} repeatedly, once for each
 350 of the other sequences.
 351
 352 \begin{methoddesc}{set_seq1}{a}
 353   Set the first sequence to be compared.  The second sequence to be
 354   compared is not changed.
 355 \end{methoddesc}
 356
 357 \begin{methoddesc}{set_seq2}{b}
 358   Set the second sequence to be compared.  The first sequence to be
 359   compared is not changed.
 360 \end{methoddesc}
 361
 362 \begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
 363   Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
 364   and \code{\var{b}[\var{blo}:\var{bhi}]}.
 365
 366   If \var{isjunk} was omitted or \code{None},
 367   \method{find_longest_match()} returns \code{(\var{i}, \var{j},
 368   \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
 369   to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
 370       \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
 371       \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
 372   For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
 373   conditions, the additional conditions
 374       \code{\var{k} >= \var{k'}},
 375       \code{\var{i} <= \var{i'}},
 376       and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
 377   are also met.
 378   In other words, of all maximal matching blocks, return one that
 379   starts earliest in \var{a}, and of all those maximal matching blocks
 380   that start earliest in \var{a}, return the one that starts earliest
 381   in \var{b}.
 382
 383 \begin{verbatim}
 384 >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
 385 >>> s.find_longest_match(0, 5, 0, 9)
 386 (0, 4, 5)
 387 \end{verbatim}
 388
 389   If \var{isjunk} was provided, first the longest matching block is
 390   determined as above, but with the additional restriction that no
 391   junk element appears in the block.  Then that block is extended as
 392   far as possible by matching (only) junk elements on both sides.
 393   So the resulting block never matches on junk except as identical
 394   junk happens to be adjacent to an interesting match.
 395
 396   Here's the same example as before, but considering blanks to be junk.
 397   That prevents \code{' abcd'} from matching the \code{' abcd'} at the
 398   tail end of the second sequence directly.  Instead only the
 399   \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
 400   the second sequence:
 401
 402 \begin{verbatim}
 403 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
 404 >>> s.find_longest_match(0, 5, 0, 9)
 405 (1, 0, 4)
 406 \end{verbatim}
 407
 408   If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
 409 \end{methoddesc}
 410
 411 \begin{methoddesc}{get_matching_blocks}{}
 412   Return list of triples describing matching subsequences.
 413   Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
 414   means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
 415   \var{b}[\var{j}:\var{j}+\var{n}]}.  The triples are monotonically
 416   increasing in \var{i} and \var{j}.
 417
 418   The last triple is a dummy, and has the value \code{(len(\var{a}),
 419   len(\var{b}), 0)}.  It is the only triple with \code{\var{n} == 0}.
 420   % Explain why a dummy is used!
 421
 422 \begin{verbatim}
 423 >>> s = SequenceMatcher(None, "abxcd", "abcd")
 424 >>> s.get_matching_blocks()
 425 [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
 426 \end{verbatim}
 427 \end{methoddesc}
 428
 429 \begin{methoddesc}{get_opcodes}{}
 430   Return list of 5-tuples describing how to turn \var{a} into \var{b}.
 431   Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
 432   \var{j1}, \var{j2})}.  The first tuple has \code{\var{i1} ==
 433   \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
 434   \var{i2} from the preceding tuple, and, likewise, \var{j1} equal to
 435   the previous \var{j2}.
 436
 437   The \var{tag} values are strings, with these meanings:
 438
 439 \begin{tableii}{l|l}{code}{Value}{Meaning}
 440   \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
 441                      replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
 442   \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
 443                     deleted.  Note that \code{\var{j1} == \var{j2}} in
 444                     this case.}
 445   \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
 446                     inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
 447                     Note that \code{\var{i1} == \var{i2}} in this
 448                     case.}
 449   \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
 450                    \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
 451                    equal).}
 452 \end{tableii}
 453
 454 For example:
 455
 456 \begin{verbatim}
 457 >>> a = "qabxcd"
 458 >>> b = "abycdf"
 459 >>> s = SequenceMatcher(None, a, b)
 460 >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
 461 ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
 462 ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
 463  delete a[0:1] (q) b[0:0] ()
 464   equal a[1:3] (ab) b[0:2] (ab)
 465 replace a[3:4] (x) b[2:3] (y)
 466   equal a[4:6] (cd) b[3:5] (cd)
 467  insert a[6:6] () b[5:6] (f)
 468 \end{verbatim}
 469 \end{methoddesc}
 470
 471 \begin{methoddesc}{get_grouped_opcodes}{\optional{n}}
 472   Return a generator of groups with up to \var{n} lines of context.
 473
 474   Starting with the groups returned by \method{get_opcodes()},
 475   this method splits out smaller change clusters and eliminates
 476   intervening ranges which have no changes.
 477
 478   The groups are returned in the same format as \method{get_opcodes()}.
 479   \versionadded{2.3}
 480 \end{methoddesc}
 481
 482 \begin{methoddesc}{ratio}{}
 483   Return a measure of the sequences' similarity as a float in the
 484   range [0, 1].
 485
 486   Where T is the total number of elements in both sequences, and M is
 487   the number of matches, this is 2.0*M / T. Note that this is
 488   \code{1.0} if the sequences are identical, and \code{0.0} if they
 489   have nothing in common.
 490
 491   This is expensive to compute if \method{get_matching_blocks()} or
 492   \method{get_opcodes()} hasn't already been called, in which case you
 493   may want to try \method{quick_ratio()} or
 494   \method{real_quick_ratio()} first to get an upper bound.
 495 \end{methoddesc}
 496
 497 \begin{methoddesc}{quick_ratio}{}
 498   Return an upper bound on \method{ratio()} relatively quickly.
 499
 500   This isn't defined beyond that it is an upper bound on
 501   \method{ratio()}, and is faster to compute.
 502 \end{methoddesc}
 503
 504 \begin{methoddesc}{real_quick_ratio}{}
 505   Return an upper bound on \method{ratio()} very quickly.
 506
 507   This isn't defined beyond that it is an upper bound on
 508   \method{ratio()}, and is faster to compute than either
 509   \method{ratio()} or \method{quick_ratio()}.
 510 \end{methoddesc}
 511
 512 The three methods that return the ratio of matching to total characters
 513 can give different results due to differing levels of approximation,
 514 although \method{quick_ratio()} and \method{real_quick_ratio()} are always
 515 at least as large as \method{ratio()}:
 516
 517 \begin{verbatim}
 518 >>> s = SequenceMatcher(None, "abcd", "bcde")
 519 >>> s.ratio()
 520 0.75
 521 >>> s.quick_ratio()
 522 0.75
 523 >>> s.real_quick_ratio()
 524 1.0
 525 \end{verbatim}
 526
 527
 528 \subsection{SequenceMatcher Examples \label{sequencematcher-examples}}
 529
 530
 531 This example compares two strings, considering blanks to be ``junk:''
 532
 533 \begin{verbatim}
 534 >>> s = SequenceMatcher(lambda x: x == " ",
 535 ...                     "private Thread currentThread;",
 536 ...                     "private volatile Thread currentThread;")
 537 \end{verbatim}
 538
 539 \method{ratio()} returns a float in [0, 1], measuring the similarity
 540 of the sequences.  As a rule of thumb, a \method{ratio()} value over
 541 0.6 means the sequences are close matches:
 542
 543 \begin{verbatim}
 544 >>> print round(s.ratio(), 3)
 545 0.866
 546 \end{verbatim}
 547
 548 If you're only interested in where the sequences match,
 549 \method{get_matching_blocks()} is handy:
 550
 551 \begin{verbatim}
 552 >>> for block in s.get_matching_blocks():
 553 ...     print "a[%d] and b[%d] match for %d elements" % block
 554 a[0] and b[0] match for 8 elements
 555 a[8] and b[17] match for 6 elements
 556 a[14] and b[23] match for 15 elements
 557 a[29] and b[38] match for 0 elements
 558 \end{verbatim}
 559
 560 Note that the last tuple returned by \method{get_matching_blocks()} is
 561 always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
 562 the only case in which the last tuple element (number of elements
 563 matched) is \code{0}.
 564
 565 If you want to know how to change the first sequence into the second,
 566 use \method{get_opcodes()}:
 567
 568 \begin{verbatim}
 569 >>> for opcode in s.get_opcodes():
 570 ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
 571  equal a[0:8] b[0:8]
 572 insert a[8:8] b[8:17]
 573  equal a[8:14] b[17:23]
 574  equal a[14:29] b[23:38]
 575 \end{verbatim}
 576
 577 See also the function \function{get_close_matches()} in this module,
 578 which shows how simple code building on \class{SequenceMatcher} can be
 579 used to do useful work.
 580
 581
 582 \subsection{Differ Objects \label{differ-objects}}
 583
 584 Note that \class{Differ}-generated deltas make no claim to be
 585 \strong{minimal} diffs. To the contrary, minimal diffs are often
 586 counter-intuitive, because they synch up anywhere possible, sometimes
 587 accidental matches 100 pages apart. Restricting synch points to
 588 contiguous matches preserves some notion of locality, at the
 589 occasional cost of producing a longer diff.
 590
 591 The \class{Differ} class has this constructor:
 592
 593 \begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}}
 594   Optional keyword parameters \var{linejunk} and \var{charjunk} are
 595   for filter functions (or \code{None}):
 596
 597   \var{linejunk}: A function that accepts a single string
 598   argument, and returns true if the string is junk.  The default is
 599   \code{None}, meaning that no line is considered junk.
 600
 601   \var{charjunk}: A function that accepts a single character argument
 602   (a string of length 1), and returns true if the character is junk.
 603   The default is \code{None}, meaning that no character is
 604   considered junk.
 605 \end{classdesc}
 606
 607 \class{Differ} objects are used (deltas generated) via a single
 608 method:
 609
 610 \begin{methoddesc}{compare}{a, b}
 611   Compare two sequences of lines, and generate the delta (a sequence
 612   of lines).
 613
 614   Each sequence must contain individual single-line strings ending
 615   with newlines. Such sequences can be obtained from the
 616   \method{readlines()} method of file-like objects.  The delta generated
 617   also consists of newline-terminated strings, ready to be printed as-is
 618   via the \method{writelines()} method of a file-like object.
 619 \end{methoddesc}
 620
 621
 622 \subsection{Differ Example \label{differ-examples}}
 623
 624 This example compares two texts. First we set up the texts, sequences
 625 of individual single-line strings ending with newlines (such sequences
 626 can also be obtained from the \method{readlines()} method of file-like
 627 objects):
 628
 629 \begin{verbatim}
 630 >>> text1 = '''  1. Beautiful is better than ugly.
 631 ...   2. Explicit is better than implicit.
 632 ...   3. Simple is better than complex.
 633 ...   4. Complex is better than complicated.
 634 ... '''.splitlines(1)
 635 >>> len(text1)
 636 4
 637 >>> text1[0][-1]
 638 '\n'
 639 >>> text2 = '''  1. Beautiful is better than ugly.
 640 ...   3.   Simple is better than complex.
 641 ...   4. Complicated is better than complex.
 642 ...   5. Flat is better than nested.
 643 ... '''.splitlines(1)
 644 \end{verbatim}
 645
 646 Next we instantiate a \class{Differ} object:
 647
 648 \begin{verbatim}
 649 >>> d = Differ()
 650 \end{verbatim}
 651
 652 Note that when instantiating a \class{Differ} object we may pass
 653 functions to filter out line and character ``junk.''  See the
 654 \class{Differ} constructor for details.
 655
 656 Finally, we compare the two:
 657
 658 \begin{verbatim}
 659 >>> result = list(d.compare(text1, text2))
 660 \end{verbatim}
 661
 662 \code{result} is a list of strings, so let's pretty-print it:
 663
 664 \begin{verbatim}
 665 >>> from pprint import pprint
 666 >>> pprint(result)
 667 ['    1. Beautiful is better than ugly.\n',
 668  '-   2. Explicit is better than implicit.\n',
 669  '-   3. Simple is better than complex.\n',
 670  '+   3.   Simple is better than complex.\n',
 671  '?     ++                                \n',
 672  '-   4. Complex is better than complicated.\n',
 673  '?            ^                     ---- ^  \n',
 674  '+   4. Complicated is better than complex.\n',
 675  '?           ++++ ^                      ^  \n',
 676  '+   5. Flat is better than nested.\n']
 677 \end{verbatim}
 678
 679 As a single multi-line string it looks like this:
 680
 681 \begin{verbatim}
 682 >>> import sys
 683 >>> sys.stdout.writelines(result)
 684     1. Beautiful is better than ugly.
 685 -   2. Explicit is better than implicit.
 686 -   3. Simple is better than complex.
 687 +   3.   Simple is better than complex.
 688 ?     ++
 689 -   4. Complex is better than complicated.
 690 ?            ^                     ---- ^
 691 +   4. Complicated is better than complex.
 692 ?           ++++ ^                      ^
 693 +   5. Flat is better than nested.
 694 \end{verbatim}