#! /usr/bin/env python

"""
Module difflib -- helpers for computing deltas between objects.

Function get_close_matches(word, possibilities, n=3, cutoff=0.6):

    Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return. n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword
    >>> get_close_matches("wheel", keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", keyword.kwlist)
    []
    >>> get_close_matches("accept", keyword.kwlist)
    ['except']

Class SequenceMatcher

SequenceMatcher is a flexible class for comparing pairs of sequences of any
type, so long as the sequence elements are hashable. The basic algorithm
predates, and is a little fancier than, an algorithm published in the late
1980's by Ratcliff and Obershelp under the hyperbolic name "gestalt pattern
matching". The basic idea is to find the longest contiguous matching
subsequence that contains no "junk" elements (R-O doesn't address junk).
The same idea is then applied recursively to the pieces of the sequences to
the left and to the right of the matching subsequence. This does not yield
minimal edit sequences, but does tend to yield matches that "look right"
to people.

Example, comparing two strings, and considering blanks to be "junk":

>>> s = SequenceMatcher(lambda x: x == " ",
...                     "private Thread currentThread;",
...                     "private volatile Thread currentThread;")
>>>

.ratio() returns a float in [0, 1], measuring the "similarity" of the
sequences. As a rule of thumb, a .ratio() value over 0.6 means the
sequences are close matches:

>>> print round(s.ratio(), 3)
0.866
>>>

If you're only interested in where the sequences match,
.get_matching_blocks() is handy:

>>> for block in s.get_matching_blocks():
...     print "a[%d] and b[%d] match for %d elements" % block
a[0] and b[0] match for 8 elements
a[8] and b[17] match for 6 elements
a[14] and b[23] match for 15 elements
a[29] and b[38] match for 0 elements

Note that the last tuple returned by .get_matching_blocks() is always a
dummy, (len(a), len(b), 0), and this is the only case in which the last
tuple element (number of elements matched) is 0.

If you want to know how to change the first sequence into the second, use
.get_opcodes():

>>> for opcode in s.get_opcodes():
...     print "%6s a[%d:%d] b[%d:%d]" % opcode
 equal a[0:8] b[0:8]
insert a[8:8] b[8:17]
 equal a[8:14] b[17:23]
 equal a[14:29] b[23:38]

See Tools/scripts/ndiff.py for a fancy human-friendly file differencer,
which uses SequenceMatcher both to view files as sequences of lines, and
lines as sequences of characters.

See also function get_close_matches() in this module, which shows how
simple code building on SequenceMatcher can be used to do useful work.

Timing: Basic R-O is cubic time worst case and quadratic time expected
case. SequenceMatcher is quadratic time for the worst case and has
expected-case behavior dependent in a complicated way on how many
elements the sequences have in common; best case time is linear.

SequenceMatcher methods:

__init__(isjunk=None, a='', b='')
    Construct a SequenceMatcher.

    Optional arg isjunk is None (the default), or a one-argument function
    that takes a sequence element and returns true iff the element is junk.
    None is equivalent to passing "lambda x: 0", i.e. no elements are
    considered to be junk. For example, pass
        lambda x: x in " \\t"
    if you're comparing lines as sequences of characters, and don't want to
    synch up on blanks or hard tabs.

    Optional arg a is the first of two sequences to be compared. By
    default, an empty string. The elements of a must be hashable.

    Optional arg b is the second of two sequences to be compared. By
    default, an empty string. The elements of b must be hashable.

set_seqs(a, b)
    Set the two sequences to be compared.

    >>> s = SequenceMatcher()
    >>> s.set_seqs("abcd", "bcde")
    >>> s.ratio()
    0.75

set_seq1(a)
    Set the first sequence to be compared.

    The second sequence to be compared is not changed.

    >>> s = SequenceMatcher(None, "abcd", "bcde")
    >>> s.ratio()
    0.75
    >>> s.set_seq1("bcde")
    >>> s.ratio()
    1.0
    >>>

    SequenceMatcher computes and caches detailed information about the
    second sequence, so if you want to compare one sequence S against many
    sequences, use .set_seq2(S) once and call .set_seq1(x) repeatedly for
    each of the other sequences.

    See also set_seqs() and set_seq2().

set_seq2(b)
    Set the second sequence to be compared.

    The first sequence to be compared is not changed.

    >>> s = SequenceMatcher(None, "abcd", "bcde")
    >>> s.ratio()
    0.75
    >>> s.set_seq2("abcd")
    >>> s.ratio()
    1.0
    >>>

    SequenceMatcher computes and caches detailed information about the
    second sequence, so if you want to compare one sequence S against many
    sequences, use .set_seq2(S) once and call .set_seq1(x) repeatedly for
    each of the other sequences.

    See also set_seqs() and set_seq1().
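
    For example, here is a small illustrative sketch of that pattern (the
    candidate strings are made up for this example): score one query
    against several candidates while reusing the cached second sequence:

    >>> s = SequenceMatcher()
    >>> s.set_seq2("bcde")
    >>> scores = []
    >>> for x in ["abcd", "wxyz"]:
    ...     s.set_seq1(x)
    ...     scores.append((s.ratio(), x))
    >>> max(scores)
    (0.75, 'abcd')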

find_longest_match(alo, ahi, blo, bhi)
    Find longest matching block in a[alo:ahi] and b[blo:bhi].

    If isjunk is not defined:

    Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
        alo <= i <= i+k <= ahi
        blo <= j <= j+k <= bhi
    and for all (i',j',k') meeting those conditions,
        k >= k'
        i <= i'
        and if i == i', j <= j'

    In other words, of all maximal matching blocks, return one that starts
    earliest in a, and of all those maximal matching blocks that start
    earliest in a, return the one that starts earliest in b.

    >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
    >>> s.find_longest_match(0, 5, 0, 9)
    (0, 4, 5)

    If isjunk is defined, first the longest matching block is determined as
    above, but with the additional restriction that no junk element appears
    in the block. Then that block is extended as far as possible by
    matching (only) junk elements on both sides. So the resulting block
    never matches on junk except as identical junk happens to be adjacent
    to an "interesting" match.

    Here's the same example as before, but considering blanks to be junk.
    That prevents " abcd" from matching the " abcd" at the tail end of the
    second sequence directly. Instead only the "abcd" can match, and
    matches the leftmost "abcd" in the second sequence:

    >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
    >>> s.find_longest_match(0, 5, 0, 9)
    (1, 0, 4)

    If no blocks match, return (alo, blo, 0).

    >>> s = SequenceMatcher(None, "ab", "c")
    >>> s.find_longest_match(0, 2, 0, 1)
    (0, 0, 0)

get_matching_blocks()
    Return list of triples describing matching subsequences.

    Each triple is of the form (i, j, n), and means that
    a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in i
    and in j.

    The last triple is a dummy, (len(a), len(b), 0), and is the only triple
    with n==0.

    >>> s = SequenceMatcher(None, "abxcd", "abcd")
    >>> s.get_matching_blocks()
    [(0, 0, 2), (3, 2, 2), (5, 4, 0)]

get_opcodes()
    Return list of 5-tuples describing how to turn a into b.

    Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple has
    i1 == j1 == 0, and remaining tuples have i1 == the i2 from the tuple
    preceding it, and likewise for j1 == the previous j2.

    The tags are strings, with these meanings:

    'replace':  a[i1:i2] should be replaced by b[j1:j2]
    'delete':   a[i1:i2] should be deleted.
                Note that j1==j2 in this case.
    'insert':   b[j1:j2] should be inserted at a[i1:i1].
                Note that i1==i2 in this case.
    'equal':    a[i1:i2] == b[j1:j2]

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> s = SequenceMatcher(None, a, b)
    >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
    ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
    replace a[3:4] (x) b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)

ratio()
    Return a measure of the sequences' similarity (float in [0,1]).

    Where T is the total number of elements in both sequences, and M is the
    number of matches, this is 2.0*M / T. Note that this is 1 if the
    sequences are identical, and 0 if they have nothing in common.

    .ratio() is expensive to compute if you haven't already computed
    .get_matching_blocks() or .get_opcodes(), in which case you may want to
    try .quick_ratio() or .real_quick_ratio() first to get an upper bound.

    >>> s = SequenceMatcher(None, "abcd", "bcde")
    >>> s.ratio()
    0.75
    >>> s.quick_ratio()
    0.75
    >>> s.real_quick_ratio()
    1.0

quick_ratio()
    Return an upper bound on .ratio() relatively quickly.

    This isn't defined beyond that it is an upper bound on .ratio(), and
    is faster to compute.

real_quick_ratio()
    Return an upper bound on ratio() very quickly.

    This isn't defined beyond that it is an upper bound on .ratio(), and
    is faster to compute than either .ratio() or .quick_ratio().
"""

TRACE = 0

class SequenceMatcher:
    def __init__(self, isjunk=None, a='', b=''):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk. None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk. For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared. By
        default, an empty string. The elements of a must be hashable. See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared. By
        default, an empty string. The elements of b must be hashable. See
        also .set_seqs() and .set_seq2().
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # b2jhas
        #      b2j.has_key
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE! Only __chain_b uses this. Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly
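    #
    # Illustrative example (the values below are assumed, not from the
    # original comments): with b = "abcab" and no junk function,
    # __chain_b leaves
    #     b2j == {'a': [0, 3], 'b': [1, 4], 'c': [2]}
    # and isbjunk(x) is false for every x. With isjunk = lambda x: x == "a",
    # the 'a' entry is deleted, so no junk-free match ever pairs up an 'a'.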

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module! If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk. I.e., we don't call isjunk at all yet. Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}
        self.b2jhas = b2jhas = b2j.has_key
        for i in xrange(len(b)):
            elt = b[i]
            if b2jhas(elt):
                b2j[elt].append(i)
            else:
                b2j[elt] = [i]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string). The difference is the number of isjunk calls
        # saved.
        isjunk, junkdict = self.isjunk, {}
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junkdict[elt] = 1  # value irrelevant; it's a set
                    del b2j[elt]

        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
        # latter is much faster. Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small. So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        self.isbjunk = junkdict.has_key

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block. Then that block is extended as
        far as possible by matching (only) junk elements on both sides. So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk. That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly. Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle. That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
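        # Illustrative trace (values assumed for illustration): with
        # a = "abab", b = "bab" and no junk, successive j2len dicts are
        # {1: 1}, {0: 1, 2: 2}, {1: 2}, {0: 1, 2: 3}; the final update sets
        # (besti, bestj, bestsize) = (1, 0, 3), i.e. a[1:4] == b[0:3] == "bab".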
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too. Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it. In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
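        # Illustrative example (assumed, not from the original comments):
        # with blanks as junk, a = "x abcd" and b = "y abcd", the junk-free
        # match is a[2:6] == b[2:6] == "abcd"; the first loop below extends
        # it left over the matching blank, giving (besti, bestj, bestsize)
        # == (1, 1, 5).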
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        if TRACE:
            print "get_matching_blocks", alo, ahi, blo, bhi
            print " returns", besti, bestj, bestsize
        return besti, bestj, bestsize

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        if TRACE:
            print '*** matching blocks', self.matching_blocks
        return self.matching_blocks

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
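        # Illustrative example (assumed): for a = "abxcd", b = "abcd" the
        # first call finds (0, 0, 2) ("ab"), recurses only to the right on
        # a[2:5] vs b[2:4], finds (3, 2, 2) ("cd"), and leaves answer ==
        # [(0, 0, 2), (3, 2, 2)] before the caller appends the sentinel.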
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append(x)
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant: we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size]. So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
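            # Illustrative example (assumed): for the docstring example
            # a = "qabxcd", b = "abycdf", get_matching_blocks() returns
            # [(1, 0, 2), (4, 3, 2), (6, 6, 0)], and this loop expands that
            # into the five opcodes shown in the docstring above.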
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return 2.0 * matches / (len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
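        # (Illustrative, with assumed values: for a = "abbb", b = "bcca" the
        # multiset intersection is one 'a' plus one 'b', so matches == 2
        # and the bound is 2.0*2 / 8 == 0.5.)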
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return 2.0 * matches / (len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
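        # (e.g. with len(a) == 4 and len(b) == 8 the bound is 2.0*4 / 12,
        # about 0.67 -- an assumed, illustrative pair of lengths)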
        return 2.0 * min(la, lb) / (la + lb)

def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return. n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword
    >>> get_close_matches("wheel", keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", keyword.kwlist)
    []
    >>> get_close_matches("accept", keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: " + `n`)
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))
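            # Illustrative (assumed) walk-through: for word "accept" and
            # candidate "except", real_quick_ratio() is 1.0 and both
            # quick_ratio() and ratio() are 2.0*4 / 12 (about 0.67), so the
            # candidate survives the default cutoff of 0.6.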
    # Sort by score.
    result.sort()
    # Retain only the best n.
    result = result[-n:]
    # Move best-scorer to head of list.
    result.reverse()
    # Strip scores.
    return [x for score, x in result]

def _test():
    import doctest, difflib
    return doctest.testmod(difflib)

if __name__ == "__main__":
    _test()