lang/common.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2007-2008 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """This module contains all the common features for languages.
  23
  24 Supported features:
  25 language code (km, af)
  26 language name (Khmer, Afrikaans)
  27 Plurals
  28   Number of plurals (nplurals)
  29   Plural equation
  30 pofilter tests to ignore
  31
  32 Segmentation
  33   characters
  34   words
  35   sentences
  36
  37 TODO:
  38 Ideas for possible features:
  39
  40 Language-Team information
  41
  42 Segmentation
  43   phrases
  44
  45 Punctuation
  46   End of sentence
  47   Start of sentence
  48   Middle of sentence
  49   Quotes
  50     single
  51     double
  52
  53 Valid characters
  54 Accelerator characters
  55 Special characters
  56 Direction (rtl or ltr)
  57 """
  58
  59 from translate.lang import data
  60 import re
  61
  62 class Common(object):
  63     """This class is the common parent class for all language classes."""
  64
  65     code = ""
  66     """The ISO 639 language code, possibly with a country specifier or other
  67     modifier.
  68
  69     Examples:
  70         km
  71         pt_BR
  72         sr_YU@Latn
  73     """
  74
  75     fullname = ""
  76     """The full (English) name of this language.
  77
  78     Dialect codes should have the form of
  79       Khmer
  80       Portugese (Brazil)
  81       #TODO: sr_YU@Latn?
  82     """
  83
  84     nplurals = 0
  85     """The number of plural forms of this language.
  86
  87     0 is not a valid value - it must be overridden.
  88     Any positive integer is valid (it should probably be between 1 and 6)
  89     Also see data.py
  90     """
  91
  92     pluralequation = "0"
  93     """The plural equation for selection of plural forms.
  94
  95     This is used for PO files to fill into the header.
  96     See U{http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html}.
  97     Also see data.py
  98     """
  99
 100     listseperator = u", "
 101     """This string is used to seperate lists of textual elements. Most
 102     languages probably can stick with the default comma, but Arabic and some
 103     Asian languages might want to override this."""
 104
 105     commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
 106     """These punctuation marks are common in English and most languages that
 107     use latin script."""
 108
 109     quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
 110     """These are different quotation marks used by various languages."""
 111
 112     invertedpunc = u"¿¡"
 113     """Inveted punctuation sometimes used at the beginning of sentences in
 114     Spanish, Asturian, Galician, and Catalan."""
 115
 116     rtlpunc = u"،؟؛÷"
 117     """These punctuation marks are used by Arabic and Persian, for example."""
 118
 119     CJKpunc = u"。、，；！？「」『』【】"
 120     """These punctuation marks are used in certain circumstances with CJK
 121     languages."""
 122
 123     indicpunc = u"।॥॰"
 124     """These punctuation marks are used by several Indic languages."""
 125
 126     ethiopicpunc = u"።፤፣"
 127     """These punctuation marks are used by several Ethiopic languages."""
 128
 129     miscpunc = u"…±°¹²³·©®×£¥€"
 130     """The middle dot (·) is used by Greek and Georgian."""
 131
 132     punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,\
 133             indicpunc, ethiopicpunc, miscpunc])
 134     """We include many types of punctuation here, simply since this is only
 135     meant to determine if something is punctuation. Hopefully we catch some
 136     languages which might not be represented with modules. Most languages won't
 137     need to override this."""
 138
 139     sentenceend = u".!?…։؟।。！？።"
 140     """These marks can indicate a sentence end. Once again we try to account
 141     for many languages. Most langauges won't need to override this."""
 142
 143     #The following tries to account for a lot of things. For the best idea of
 144     #what works, see test_common.py. We try to ignore abbreviations, for
 145     #example, by checking that the following sentence doesn't start with lower
 146     #case or numbers.
 147     sentencere = re.compile(r"""(?s)    #make . also match newlines
 148                             .*?         #anything, but match non-greedy
 149                             [%s]        #the puntuation for sentence ending
 150                             \s+         #the spacing after the puntuation
 151                             (?=[^a-z\d])#lookahead that next part starts with caps
 152                             """ % sentenceend, re.VERBOSE)
 153
 154     puncdict = {}
 155     """A dictionary of punctuation transformation rules that can be used by
 156     punctranslate()."""
 157
 158     ignoretests = []
 159     """List of pofilter tests for this language that must be ignored."""
 160
 161     checker = None
 162     """A language specific checker (see filters.checks).
 163
 164     This doesn't need to be supplied, but will be used if it exists."""
 165
 166     _languages = {}
 167
 168     validaccel = None
 169     """Characters that can be used as accelerators (access keys) i.e. Alt+X
 170     where X is the accelerator.  These can include combining diacritics as
 171     long as they are accessible from the users keyboard in a single keystroke,
 172     but normally they would be at least precomposed characters. All characters,
 173     lower and upper, are included in the list."""
 174
 175     def __new__(cls, code):
 176         """This returns the language class for the given code, following a
 177         singleton like approach (only one object per language)."""
 178         code = code or ""
 179         # First see if a language object for this code already exists
 180         if code in cls._languages:
 181             return cls._languages[code]
 182         # No existing language. Let's build a new one and keep a copy
 183         language = cls._languages[code] = object.__new__(cls)
 184
 185         language.code = code
 186         while code:
 187             langdata = data.languages.get(code, None)
 188             if langdata:
 189                 language.fullname, language.nplurals, language.pluralequation = langdata
 190                 break
 191             code = data.simplercode(code)
 192         if not code:
 193 #            print >> sys.stderr, "Warning: No information found about language code %s" % code
 194             pass
 195         return language
 196
 197     def __repr__(self):
 198         """Give a simple string representation without address information to
 199         be able to store it in text for comparison later."""
 200         detail = ""
 201         if self.code:
 202             detail = "(%s)" % self.code
 203         return "<class 'translate.lang.common.Common%s'>" % detail
 204
 205     def punctranslate(cls, text):
 206         """Converts the punctuation in a string according to the rules of the
 207         language."""
 208 #        TODO: look at po::escapeforpo() for performance idea
 209         for source, target in cls.puncdict.iteritems():
 210             text = text.replace(source, target)
 211         # Let's account for cases where a punctuation symbol plus a space is
 212         # replaced, but the space won't exist at the end of a message
 213         if text and text[-1] + " " in cls.puncdict:
 214             text = text[:-1] + cls.puncdict[text[-1] + " "]
 215         return text
 216     punctranslate = classmethod(punctranslate)
 217
 218     def character_iter(cls, text):
 219         """Returns an iterator over the characters in text."""
 220         #We don't return more than one consecutive whitespace character
 221         prev = 'A'
 222         for c in text:
 223             if c.isspace() and prev.isspace():
 224                 continue
 225             prev = c
 226             if not (c in cls.punctuation):
 227                 yield c
 228     character_iter = classmethod(character_iter)
 229
 230     def characters(cls, text):
 231         """Returns a list of characters in text."""
 232         return [c for c in cls.character_iter(text)]
 233     characters = classmethod(characters)
 234
 235     def word_iter(cls, text):
 236         """Returns an iterator over the words in text."""
 237         #TODO: Consider replacing puctuation with space before split()
 238         for w in text.split():
 239             word = w.strip(cls.punctuation)
 240             if word:
 241                 yield word
 242     word_iter = classmethod(word_iter)
 243
 244     def words(cls, text):
 245         """Returns a list of words in text."""
 246         return [w for w in cls.word_iter(text)]
 247     words = classmethod(words)
 248
 249     def sentence_iter(cls, text, strip=True):
 250         """Returns an iterator over the sentences in text."""
 251         lastmatch = 0
 252         text = text or ""
 253         iter = cls.sentencere.finditer(text)
 254         for item in iter:
 255             lastmatch = item.end()
 256             sentence = item.group()
 257             if strip: sentence = sentence.strip()
 258             if sentence: yield sentence
 259         remainder = text[lastmatch:]
 260         if strip: remainder = remainder.strip()
 261         if remainder: yield remainder
 262     sentence_iter = classmethod(sentence_iter)
 263
 264     def sentences(cls, text, strip=True):
 265         """Returns a list of senteces in text."""
 266         return [s for s in cls.sentence_iter(text, strip=strip)]
 267     sentences = classmethod(sentences)
 268
 269     def capsstart(cls, text):
 270         """Determines whether the text starts with a capital letter."""
 271         stripped = text.lstrip().lstrip(cls.punctuation)
 272         return stripped and stripped[0].isupper()
 273     capsstart = classmethod(capsstart)
 274