1 # -*- coding: utf-8 -*-
3 # Copyright 2006 Zuza Software Foundation
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 """A class that does terminology matching"""
25 # We don't want to miss certain forms of words that only change a little
26 # at the end. Now we are tying this code to English, but it should serve
27 # us well. For example "category" should be found in "categories",
28 # "copy" should be found in "copied"
# Each tuple defines a regular expression to search for and the text
# that each match should be replaced with.
# (regex, replacement) pairs; each one rewrites a term into an alternative
# spelling. Raw string literals are used for the patterns so that "\s"
# reaches the regex engine without depending on Python's handling of
# unrecognised string escapes.
ignorepatterns = [
    (r"y\s*$", "ie"),  # category/categories, identify/identifies, apply/applied
    (r"[\s-]*", ""),  # down time / downtime, pre-order / preorder
    ("-", " "),  # pre-order / pre order
    (" ", "-"),  # pre order / pre-order
]
38 #TODO: compile regexes
40 class TerminologyComparer
:
41 def __init__(self
, max_len
=500):
42 self
.MAX_LEN
= max_len
44 def similarity(self
, a
, b
, stoppercentage
=40):
45 """returns the match quality of term b in the text a"""
46 # We could segment the words, but mostly it will give less ideal
47 # results, since we'll miss plurals, etc. Then we also can't search for
48 # multiword terms, such as "Free Software". Ideally we should use a
49 # stemmer, like the Porter stemmer.
51 # So we just see if the word occurs anywhere. This is not perfect since
52 # we might get more than we bargained for. The term "form" will be found
53 # in the word "format", for example. A word like "at" will trigger too
54 # many false positives.
56 # First remove a possible disambiguating bracket at the end
57 b
= re
.sub("\s+\(.*\)\s*$", "", b
)
62 pos
= a
[:self
.MAX_LEN
].find(b
)
64 return 100 - pos
* 10 / len(a
[:self
.MAX_LEN
])
66 for ignorepattern
in ignorepatterns
:
67 newb
= re
.sub(ignorepattern
[0], ignorepattern
[1], b
)
68 if newb
in a
[:self
.MAX_LEN
]: