# -*- coding: utf-8 -*-
#
# Copyright 2006-2007 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """Class to perform translation memory matching from a store of translation units"""

import heapq

from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring
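
# A minimal usage sketch (assumes `store` is any translate.storage store with
# its .units populated, e.g. a parsed PO file; names and values here are
# hypothetical):
#
#   tmmatcher = matcher(store, max_candidates=5, min_similarity=80)
#   for unit in tmmatcher.matches(u"Open file"):
#       print unit.source, "->", unit.target, unit.getnotes()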
32 """Returns the length of the source string"""
33 return len(unit
.source
)

def sourcelencmp(x, y):
    """Compares using sourcelen"""
    # This is mostly useful for Python 2.3
    xlen = sourcelen(x)
    ylen = sourcelen(y)
    return cmp(xlen, ylen)
43 """A class that will do matching and store configuration for the matching process"""
44 def __init__(self
, store
, max_candidates
=10, min_similarity
=75, max_length
=70, comparer
=None, usefuzzy
=False):
45 """max_candidates is the maximum number of candidates that should be assembled,
46 min_similarity is the minimum similarity that must be attained to be included in
47 the result, comparer is an optional Comparer with similarity() function"""
49 comparer
= lshtein
.LevenshteinComparer(max_length
)
50 self
.comparer
= comparer
51 self
.setparameters(max_candidates
, min_similarity
, max_length
)
52 self
.usefuzzy
= usefuzzy
54 self
.addpercentage
= True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            # Track source/target pairs we have already seen so that exact
            # duplicates only enter the TM once
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if not isinstance(stores, list):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        # The candidates are kept sorted by source length so that matches()
        # can narrow its search with a binary search
        self.candidates.units.sort(sourcelencmp)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #        (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if not isinstance(units, list):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use them.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new file.
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            if store:
                simpleunit.filepath = store.filepath
                simpleunit.translator = store.translator
                simpleunit.date = store.date
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(sourcelencmp)
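
    # A usage sketch (hypothetical store name): after construction, the TM can
    # grow incrementally with e.g.
    #   tmmatcher.extendtm(otherstore.units, store=otherstore)
    # without rebuilding the candidate list from scratch.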

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the TM. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)
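    # For example, with the default min_similarity=75 and MAX_LENGTH=70, a
    # 30-character query is only compared against candidates of up to
    # min(30 / 0.75, 70) = 40 characters.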

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)
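    # For example, with min_similarity=75 the same 30-character query skips
    # all candidates shorter than max(30 * 0.75, 1) = 22.5 characters; the
    # binary search in matches() finds that starting point.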

    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @param text: The text that will be searched for in the translation memory
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                # The candidates are sorted by length, so nothing beyond this
                # point can still match
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                # bestcandidates is a heap with the worst kept score at index
                # 0, so heapreplace() evicts the current worst candidate
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    break
                if min_similarity < lowestscore:
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort()
        # We reverse as separate step for compatibility with Python 2.3
        bestcandidates.reverse()
        return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            newunit.filepath = candidate.filepath
            newunit.translator = candidate.translator
            newunit.date = candidate.date
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units
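
    # Sketch of the result (hypothetical score): a candidate matched at 87%
    # comes back as a po.pounit carrying the stored translation, with a final
    # note of "87%" when self.addpercentage is True.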

class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        for unit in self.candidates.units:
            unit.source = unit.source.lower()

    def getstartlength(self, min_similarity, text):
        # Let's limit false matches by not working with terms of two
        # characters or less
        return 3

    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 30 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 30

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text = text.lower()
        matches = matcher.matches(self, text)
        return matches