search/lshtein.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright 2006-2007 Zuza Software Foundation
   4 #
   5 # This file is part of translate.
   6 #
   7 # translate is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # translate is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with translate; if not, write to the Free Software
  19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20
  21 """A class to calculate a similarity based on the Levenshtein
  22 distance. See http://en.wikipedia.org/wiki/Levenshtein_distance.
  23
  24 If available, the python-Levenshtein package will be used which will provide
  25 better performance as it is implemented natively. See
  26 http://trific.ath.cx/python/levenshtein/
  27 """
  28
  29 import math
  30 import sys
  31
  32 def python_distance(a, b, stopvalue=-1):
  33     """Calculates the distance for use in similarity calculation. Python
  34     version."""
  35     l1 = len(a)
  36     l2 = len(b)
  37     if stopvalue == -1:
  38         stopvalue = l2
  39     current = range(l1+1)
  40     for i in range(1, l2+1):
  41         previous, current = current, [i]+[0]*l1
  42         least = l2
  43         for j in range(1, l1 + 1):
  44             change = previous[j-1]
  45             if a[j-1] != b[i-1]:
  46                 change = change + 1
  47             insert = previous[j] + 1
  48             delete = current[j-1] + 1
  49             current[j] = min(insert, delete, change)
  50             if least > current[j]:
  51                 least = current[j]
  52         #The smallest value in the current array is the best (lowest) value
  53         #that can be attained in the end if the strings are identical further
  54         if least > stopvalue:
  55             return least
  56
  57     return current[l1]
  58
  59 def native_distance(a, b, stopvalue=0):
  60     """Same as python_distance in functionality. This uses the fast C
  61     version if we detected it earlier.
  62
  63     Note that this does not support arbitrary sequence types, but only
  64     string types."""
  65     return Levenshtein.distance(a, b)
  66
  67 try:
  68     import Levenshtein as Levenshtein
  69     distance = native_distance
  70 except Exception:
  71     print >> sys.stderr, "Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching."
  72     distance = python_distance
  73
  74 class LevenshteinComparer:
  75     def __init__(self, max_len=200):
  76         self.MAX_LEN = max_len
  77
  78     def similarity(self, a, b, stoppercentage=40):
  79         similarity = self.similarity_real(a, b, stoppercentage)
  80         measurements = 1
  81
  82 #        chr_a = segment.characters(a)
  83 #        chr_b = segment.characters(b)
  84 #        if chr_a and chr_b and abs(len(chr_a) - len(a)) + abs(len(chr_b) - len(b)):
  85 #            similarity += self.similarity_real(chr_a, chr_b, stoppercentage)
  86 #            measurements += 1
  87 #        else:
  88 #            similarity *= 2
  89 #            measurements += 1
  90 #
  91 #        wrd_a = segment.words(a)
  92 #        wrd_b = segment.words(b)
  93 #        if len(wrd_a) + len(wrd_b) > 2:
  94 #            similarity += self.similarity_real(wrd_a, wrd_b, 0)
  95 #            measurements += 1
  96         return similarity / measurements
  97
  98     def similarity_real(self, a, b, stoppercentage=40):
  99         """Returns the similarity between a and b based on Levenshtein distance. It
 100     can stop prematurely as soon as it sees that a and b will be no simmilar than
 101     the percentage specified in stoppercentage.
 102
 103     The Levenshtein distance is calculated, but the following should be noted:
 104         * Only the first MAX_LEN characters are considered. Long strings differing
 105           at the end will therefore seem to match better than they should. See the
 106           use of the variable penalty to lessen the effect of this.
 107         * Strings with widely different lengths give the opportunity for shortcut.
 108           This is by definition of the Levenshtein distance: the distance will be
 109           at least as much as the difference in string length.
 110         * Calculation is stopped as soon as a similarity of stoppercentage becomes
 111           unattainable. See the use of the variable stopvalue.
 112         * Implementation uses memory O(min(len(a), len(b))
 113         * Excecution time is O(len(a)*len(b))
 114     """
 115         l1, l2 = len(a), len(b)
 116         if l1 == 0 or l2 == 0:
 117             return 0
 118         #Let's make l1 the smallest
 119         if l1 > l2:
 120             l1, l2 = l2, l1
 121             a, b = b, a
 122
 123         #maxsimilarity is the maximum similarity that can be attained as constrained
 124         #by the difference in string length
 125         maxsimilarity = 100 - 100.0*abs(l1 - l2)/l2
 126         if maxsimilarity < stoppercentage:
 127             return maxsimilarity * 1.0
 128
 129         #Let's penalise the score in cases where we shorten strings
 130         penalty = 0
 131         if l2 > self.MAX_LEN:
 132             b = b[:self.MAX_LEN]
 133             l2 = self.MAX_LEN
 134             penalty += 7
 135             if l1 > self.MAX_LEN:
 136                 a = a[:self.MAX_LEN]
 137                 l1 = self.MAX_LEN
 138                 penalty += 7
 139
 140         #The actual value in the array that would represent a giveup situation:
 141         stopvalue = math.ceil((100.0 - stoppercentage)/100 * l2)
 142         dist = distance(a, b, stopvalue)
 143         if dist > stopvalue:
 144             return stoppercentage - 1.0
 145
 146         #If MAX_LEN came into play, we consider the calculated distance to be
 147         #representative of the distance between the whole, untrimmed strings
 148         if dist != 0:
 149             penalty = 0
 150         return 100 - (dist*1.0/l2)*100 - penalty
 151
 152
 153 if __name__ == "__main__":
 154     from sys import argv
 155     comparer = LevenshteinComparer()
 156     print "Similarity:\n%s" % comparer.similarity(argv[1], argv[2], 50)
 157