search/terminology.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright 2006 Zuza Software Foundation
   4 #
   5 # This file is part of translate.
   6 #
   7 # translate is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # translate is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with translate; if not, write to the Free Software
  19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20
  21 """A class that does terminology matching"""
  22
  23 import re
  24
  25 # We don't want to miss certain forms of words that only change a little
  26 # at the end. Now we are tying this code to English, but it should serve
  27 # us well. For example "category" should be found in "categories",
  28 # "copy" should be found in "copied"
  29 #
  30 # The tuples define a regular expression to search for, and what with
  31 # what it should be replaced.
  32 ignorepatterns = [("y\s*$", "ie"),          #category/categories, identify/identifies, apply/applied
  33                   ("[\s-]*", ""),           #down time / downtime, pre-order / preorder
  34                   ("-", " "),               #pre-order / pre order
  35                   (" ", "-"),               #pre order / pre-order
  36                  ]
  37
  38 #TODO: compile regexes
  39
  40 class TerminologyComparer:
  41     def __init__(self, max_len=500):
  42         self.MAX_LEN = max_len
  43
  44     def similarity(self, a, b, stoppercentage=40):
  45         """returns the match quality of term b in the text a"""
  46         # We could segment the words, but mostly it will give less ideal
  47         # results, since we'll miss plurals, etc. Then we also can't search for
  48         # multiword terms, such as "Free Software". Ideally we should use a
  49         # stemmer, like the Porter stemmer.
  50
  51         # So we just see if the word occurs anywhere. This is not perfect since
  52         # we might get more than we bargained for. The term "form" will be found
  53         # in the word "format", for example. A word like "at" will trigger too
  54         # many false positives.
  55
  56         # First remove a possible disambiguating bracket at the end
  57         b = re.sub("\s+\(.*\)\s*$", "", b)
  58
  59         if len(b) <= 2:
  60             return 0
  61
  62         pos = a[:self.MAX_LEN].find(b)
  63         if pos >= 0:
  64             return 100 - pos * 10 / len(a[:self.MAX_LEN])
  65
  66         for ignorepattern in ignorepatterns:
  67             newb = re.sub(ignorepattern[0], ignorepattern[1], b)
  68             if newb in a[:self.MAX_LEN]:
  69                 return 80
  70         return 0
  71