# -*- coding: utf-8 -*-
#
# Copyright 2006-2007 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """Class to perform translation memory matching from a store of translation units"""

import heapq

from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring
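
# A minimal usage sketch (assumes `store` is any translate.storage store with
# its .units populated, e.g. a parsed PO file; names and values here are
# hypothetical):
#
#   tmmatcher = matcher(store, max_candidates=5, min_similarity=80)
#   for unit in tmmatcher.matches(u"Open file"):
#       print unit.source, "->", unit.target, unit.getnotes()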
32 """Returns the length of the source string"""
33 return len(unit
.source
)

def sourcelencmp(x, y):
    """Compares using sourcelen"""
    # This is mostly useful for Python 2.3
    xlen = sourcelen(x)
    ylen = sourcelen(y)
    return cmp(xlen, ylen)
43 """A class that will do matching and store configuration for the matching process"""
44 def __init__(self
, store
, max_candidates
=10, min_similarity
=75, max_length
=70, comparer
=None, usefuzzy
=False):
45 """max_candidates is the maximum number of candidates that should be assembled,
46 min_similarity is the minimum similarity that must be attained to be included in
47 the result, comparer is an optional Comparer with similarity() function"""
49 comparer
= lshtein
.LevenshteinComparer(max_length
)
50 self
.comparer
= comparer
51 self
.setparameters(max_candidates
, min_similarity
, max_length
)
52 self
.usefuzzy
= usefuzzy
54 self
.addpercentage
= True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            # Track source/target pairs we have already seen so that exact
            # duplicates only enter the TM once
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if not isinstance(stores, list):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        # The candidates are kept sorted by source length so that matches()
        # can narrow its search with a binary search
        self.candidates.units.sort(sourcelencmp)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #        (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if not isinstance(units, list):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use them.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new file.
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            if store:
                simpleunit.filepath = store.filepath
                simpleunit.translator = store.translator
                simpleunit.date = store.date
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(sourcelencmp)
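
    # A usage sketch (hypothetical store name): after construction, the TM can
    # grow incrementally with e.g.
    #   tmmatcher.extendtm(otherstore.units, store=otherstore)
    # without rebuilding the candidate list from scratch.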

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the TM. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)
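    # For example, with the default min_similarity=75 and MAX_LENGTH=70, a
    # 30-character query is only compared against candidates of up to
    # min(30 / 0.75, 70) = 40 characters.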

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)
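    # For example, with min_similarity=75 the same 30-character query skips
    # all candidates shorter than max(30 * 0.75, 1) = 22.5 characters; the
    # binary search in matches() finds that starting point.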

    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @param text: The text that will be searched for in the translation memory
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                # The candidates are sorted by length, so nothing beyond this
                # point can still match
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                # bestcandidates is a heap with the worst kept score at index
                # 0, so heapreplace() evicts the current worst candidate
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    break
                if min_similarity < lowestscore:
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort()
        # We reverse as separate step for compatibility with Python 2.3
        bestcandidates.reverse()
        return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            newunit.filepath = candidate.filepath
            newunit.translator = candidate.translator
            newunit.date = candidate.date
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units
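
    # Sketch of the result (hypothetical score): a candidate matched at 87%
    # comes back as a po.pounit carrying the stored translation, with a final
    # note of "87%" when self.addpercentage is True.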

class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        for unit in self.candidates.units:
            unit.source = unit.source.lower()

    def getstartlength(self, min_similarity, text):
        # Let's limit false matches by not working with terms of two
        # characters or less
        return 3

    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 30 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 30

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text = text.lower()
        matches = matcher.matches(self, text)
        return matches