1 # -*- coding: utf-8 -*-
3 # Copyright 2006-2007 Zuza Software Foundation
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 """A class to calculate a similarity based on the Levenshtein
22 distance. See http://en.wikipedia.org/wiki/Levenshtein_distance.
24 If available, the python-Levenshtein package will be used which will provide
25 better performance as it is implemented natively. See
26 http://trific.ath.cx/python/levenshtein/
32 def python_distance(a
, b
, stopvalue
=-1):
33 """Calculates the distance for use in similarity calculation. Python
40 for i
in range(1, l2
+1):
41 previous
, current
= current
, [i
]+[0]*l1
43 for j
in range(1, l1
+ 1):
44 change
= previous
[j
-1]
47 insert
= previous
[j
] + 1
48 delete
= current
[j
-1] + 1
49 current
[j
] = min(insert
, delete
, change
)
50 if least
> current
[j
]:
52 #The smallest value in the current array is the best (lowest) value
53 #that can be attained in the end if the strings are identical further
59 def native_distance(a
, b
, stopvalue
=0):
60 """Same as python_distance in functionality. This uses the fast C
61 version if we detected it earlier.
63 Note that this does not support arbitrary sequence types, but only
65 return Levenshtein
.distance(a
, b
)
68 import Levenshtein
as Levenshtein
69 distance
= native_distance
71 print >> sys
.stderr
, "Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching."
72 distance
= python_distance
74 class LevenshteinComparer
:
75 def __init__(self
, max_len
=200):
76 self
.MAX_LEN
= max_len
78 def similarity(self
, a
, b
, stoppercentage
=40):
79 similarity
= self
.similarity_real(a
, b
, stoppercentage
)
82 # chr_a = segment.characters(a)
83 # chr_b = segment.characters(b)
84 # if chr_a and chr_b and abs(len(chr_a) - len(a)) + abs(len(chr_b) - len(b)):
85 # similarity += self.similarity_real(chr_a, chr_b, stoppercentage)
91 # wrd_a = segment.words(a)
92 # wrd_b = segment.words(b)
93 # if len(wrd_a) + len(wrd_b) > 2:
94 # similarity += self.similarity_real(wrd_a, wrd_b, 0)
96 return similarity
/ measurements
98 def similarity_real(self
, a
, b
, stoppercentage
=40):
99 """Returns the similarity between a and b based on Levenshtein distance. It
100 can stop prematurely as soon as it sees that a and b will be no simmilar than
101 the percentage specified in stoppercentage.
103 The Levenshtein distance is calculated, but the following should be noted:
104 * Only the first MAX_LEN characters are considered. Long strings differing
105 at the end will therefore seem to match better than they should. See the
106 use of the variable penalty to lessen the effect of this.
107 * Strings with widely different lengths give the opportunity for shortcut.
108 This is by definition of the Levenshtein distance: the distance will be
109 at least as much as the difference in string length.
110 * Calculation is stopped as soon as a similarity of stoppercentage becomes
111 unattainable. See the use of the variable stopvalue.
112 * Implementation uses memory O(min(len(a), len(b))
113 * Excecution time is O(len(a)*len(b))
115 l1
, l2
= len(a
), len(b
)
116 if l1
== 0 or l2
== 0:
118 #Let's make l1 the smallest
123 #maxsimilarity is the maximum similarity that can be attained as constrained
124 #by the difference in string length
125 maxsimilarity
= 100 - 100.0*abs(l1
- l2
)/l2
126 if maxsimilarity
< stoppercentage
:
127 return maxsimilarity
* 1.0
129 #Let's penalise the score in cases where we shorten strings
131 if l2
> self
.MAX_LEN
:
135 if l1
> self
.MAX_LEN
:
140 #The actual value in the array that would represent a giveup situation:
141 stopvalue
= math
.ceil((100.0 - stoppercentage
)/100 * l2
)
142 dist
= distance(a
, b
, stopvalue
)
144 return stoppercentage
- 1.0
146 #If MAX_LEN came into play, we consider the calculated distance to be
147 #representative of the distance between the whole, untrimmed strings
150 return 100 - (dist
*1.0/l2
)*100 - penalty
153 if __name__
== "__main__":
155 comparer
= LevenshteinComparer()
156 print "Similarity:\n%s" % comparer
.similarity(argv
[1], argv
[2], 50)