for git v1.5.2 (and below): chdir to the directory of the target file before executin...
[translate_toolkit.git] / search / terminology.py
blob6ffce447e5601e9a6decd64f2449f408c6ef387e
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2006 Zuza Software Foundation
4 #
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 """A class that does terminology matching"""
23 import re
25 # We don't want to miss certain forms of words that only change a little
26 # at the end. Now we are tying this code to English, but it should serve
27 # us well. For example "category" should be found in "categories",
28 # "copy" should be found in "copied"
30 # The tuples define a regular expression to search for, and what
31 # it should be replaced with.
# Each entry is (regular expression, replacement). Applying an entry to a term
# yields a variant spelling that similarity() then looks for in the text.
# Raw strings keep the backslashes in the patterns from being read as
# (invalid) string escapes.
ignorepatterns = [
    (r"y\s*$", "ie"),   # category/categories, identify/identifies, apply/applied
    (r"[\s-]*", ""),    # down time / downtime, pre-order / preorder
    (r"-", " "),        # pre-order / pre order
    (r" ", "-"),        # pre order / pre-order
]

# TODO: compile regexes


class TerminologyComparer:
    """Scores how well a terminology entry matches a piece of text."""

    def __init__(self, max_len=500):
        """Create a comparer.

        max_len -- only the first max_len characters of the text passed to
                   similarity() are searched.
        """
        self.MAX_LEN = max_len

    def similarity(self, a, b, stoppercentage=40):
        """returns the match quality of term b in the text a

        a -- the text to search; only its first self.MAX_LEN characters
             are considered.
        b -- the term; a trailing bracketed part such as " (noun)" is
             removed before matching.
        stoppercentage -- not used by this implementation; kept so existing
             callers that pass it keep working.

        Returns 0 when the term is two characters or shorter or is not
        found, 80 when only a variant produced by ignorepatterns is found,
        and otherwise 100 minus a penalty of less than 10 that grows with
        how far into the searched text the term first occurs.
        """
        # We could segment the words, but mostly it will give less ideal
        # results, since we'll miss plurals, etc. Then we also can't search for
        # multiword terms, such as "Free Software". Ideally we should use a
        # stemmer, like the Porter stemmer.

        # So we just see if the word occurs anywhere. This is not perfect since
        # we might get more than we bargained for. The term "form" will be found
        # in the word "format", for example. A word like "at" will trigger too
        # many false positives.

        # First remove a possible disambiguating bracket at the end
        b = re.sub(r"\s+\(.*\)\s*$", "", b)

        if len(b) <= 2:
            return 0

        # Slice once; every check below works on the same truncated text.
        text = a[:self.MAX_LEN]

        pos = text.find(b)
        if pos >= 0:
            # text is non-empty here because it contains b (len(b) > 2).
            return 100 - pos * 10 / len(text)

        for pattern, replacement in ignorepatterns:
            newb = re.sub(pattern, replacement, b)
            # An empty variant (e.g. a term made only of hyphens) would be
            # "found" in any text, and an unchanged variant was already
            # searched for above without success.
            if not newb or newb == b:
                continue
            if newb in text:
                return 80
        return 0