for git v1.5.2 (and below): chdir to the directory of the target file before executin...
[translate_toolkit.git] / lang / common.py
blob93d6caa744b86c9c793f147a0eee193a44ce11d8
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2007-2008 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """This module contains all the common features for languages.
24 Supported features:
25 language code (km, af)
26 language name (Khmer, Afrikaans)
27 Plurals
28 Number of plurals (nplurals)
29 Plural equation
30 pofilter tests to ignore
32 Segmentation
33 characters
34 words
35 sentences
37 TODO:
38 Ideas for possible features:
40 Language-Team information
42 Segmentation
43 phrases
45 Punctuation
46 End of sentence
47 Start of sentence
48 Middle of sentence
49 Quotes
50 single
51 double
53 Valid characters
54 Accelerator characters
55 Special characters
56 Direction (rtl or ltr)
57 """
59 from translate.lang import data
60 import re
class Common(object):
    """This class is the common parent class for all language classes.

    It bundles the per-language metadata (code, name, plural information),
    several punctuation tables, and class-level helpers that segment text
    into characters, words and sentences.  Instances are cached per language
    code, see L{__new__}.
    """

    code = ""
    """The ISO 639 language code, possibly with a country specifier or other
    modifier.

    Examples:
        km
        pt_BR
        sr_YU@Latn
    """

    fullname = ""
    """The full (English) name of this language.

    Dialect codes should have the form of
      - Khmer
      - Portugese (Brazil)
      - #TODO: sr_YU@Latn?
    """

    nplurals = 0
    """The number of plural forms of this language.

    0 is not a valid value - it must be overridden.
    Any positive integer is valid (it should probably be between 1 and 6)
    Also see data.py
    """

    pluralequation = "0"
    """The plural equation for selection of plural forms.

    This is used for PO files to fill into the header.
    See U{http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html}.
    Also see data.py
    """

    # NOTE: the attribute name is misspelled, but it is part of the public
    # interface and therefore kept as is.
    listseperator = u", "
    """This string is used to separate lists of textual elements.  Most
    languages probably can stick with the default comma, but Arabic and some
    Asian languages might want to override this."""

    commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
    """These punctuation marks are common in English and most languages that
    use latin script."""

    quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
    """These are different quotation marks used by various languages."""

    invertedpunc = u"¿¡"
    """Inverted punctuation sometimes used at the beginning of sentences in
    Spanish, Asturian, Galician, and Catalan."""

    rtlpunc = u"،؟؛÷"
    """These punctuation marks are used by Arabic and Persian, for example."""

    CJKpunc = u"。、,;!?「」『』【】"
    """These punctuation marks are used in certain circumstances with CJK
    languages."""

    indicpunc = u"।॥॰"
    """These punctuation marks are used by several Indic languages."""

    ethiopicpunc = u"።፤፣"
    """These punctuation marks are used by several Ethiopic languages."""

    miscpunc = u"…±°¹²³·©®×£¥€"
    """The middle dot (·) is used by Greek and Georgian."""

    punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc,
                            CJKpunc, indicpunc, ethiopicpunc, miscpunc])
    """We include many types of punctuation here, simply since this is only
    meant to determine if something is punctuation.  Hopefully we catch some
    languages which might not be represented with modules.  Most languages
    won't need to override this."""

    sentenceend = u".!?…։؟।。!?።"
    """These marks can indicate a sentence end.  Once again we try to account
    for many languages.  Most languages won't need to override this."""

    # The following tries to account for a lot of things.  For the best idea
    # of what works, see test_common.py.  We try to ignore abbreviations, for
    # example, by checking that the following sentence doesn't start with
    # lower case or numbers.
    #
    # NOTE: the pattern is built once, here, from the value sentenceend has
    # in this class body.  A subclass that only overrides sentenceend keeps
    # using this compiled pattern unless it also overrides sentencere.
    sentencere = re.compile(r"""(?s)        #make . also match newlines
                            .*?             #anything, but match non-greedy
                            [%s]            #the punctuation for sentence ending
                            \s+             #the spacing after the punctuation
                            (?=[^a-z\d])    #lookahead that next part starts with caps
                            """ % sentenceend, re.VERBOSE)

    puncdict = {}
    """A dictionary of punctuation transformation rules that can be used by
    punctranslate()."""

    ignoretests = []
    """List of pofilter tests for this language that must be ignored."""

    checker = None
    """A language specific checker (see filters.checks).

    This doesn't need to be supplied, but will be used if it exists."""

    # Cache of already constructed language objects, keyed by language code.
    # It is looked up through cls, so unless a subclass defines its own
    # _languages, this one dictionary is shared.
    _languages = {}

    validaccel = None
    """Characters that can be used as accelerators (access keys) i.e. Alt+X
    where X is the accelerator.  These can include combining diacritics as
    long as they are accessible from the users keyboard in a single keystroke,
    but normally they would be at least precomposed characters.  All
    characters, lower and upper, are included in the list."""

    def __new__(cls, code):
        """This returns the language class for the given code, following a
        singleton like approach (only one object per language).

        @param code: the language code; a false value is treated as "".
        @return: the cached object for this code, or a newly built (and then
        cached) one.
        """
        code = code or ""
        # First see if a language object for this code already exists
        if code in cls._languages:
            return cls._languages[code]

        # No existing language. Let's build a new one and keep a copy
        language = cls._languages[code] = object.__new__(cls)
        language.code = code

        # Look for data about this code, retrying with whatever
        # data.simplercode() returns until something is found or the code is
        # exhausted.  If nothing is found the class level defaults stay in
        # effect for fullname, nplurals and pluralequation.
        lookup = code
        while lookup:
            langdata = data.languages.get(lookup)
            if langdata:
                (language.fullname, language.nplurals,
                 language.pluralequation) = langdata
                break
            lookup = data.simplercode(lookup)
        return language

    def __repr__(self):
        """Give a simple string representation without address information to
        be able to store it in text for comparison later."""
        detail = ""
        if self.code:
            detail = "(%s)" % self.code
        return "<class 'translate.lang.common.Common%s'>" % detail

    def punctranslate(cls, text):
        """Converts the punctuation in a string according to the rules of the
        language.

        @param text: the string to convert.
        @return: text with every key of puncdict replaced by its value.
        """
        # TODO: look at po::escapeforpo() for performance idea
        # items() rather than iteritems(), so that this works on both
        # Python 2 and Python 3.
        for source, target in cls.puncdict.items():
            text = text.replace(source, target)
        # Let's account for cases where a punctuation symbol plus a space is
        # replaced, but the space won't exist at the end of a message
        if text and text[-1] + " " in cls.puncdict:
            text = text[:-1] + cls.puncdict[text[-1] + " "]
        return text
    punctranslate = classmethod(punctranslate)

    def character_iter(cls, text):
        """Returns an iterator over the characters in text.

        Punctuation characters are left out, and a run of consecutive
        whitespace characters is reduced to its first character.
        """
        # We don't return more than one consecutive whitespace character.
        # The initial value only needs to be any non-whitespace character.
        prev = 'A'
        for c in text:
            if c.isspace() and prev.isspace():
                continue
            prev = c
            if c not in cls.punctuation:
                yield c
    character_iter = classmethod(character_iter)

    def characters(cls, text):
        """Returns a list of characters in text (see character_iter())."""
        return list(cls.character_iter(text))
    characters = classmethod(characters)

    def word_iter(cls, text):
        """Returns an iterator over the words in text.

        The text is split on whitespace, punctuation is stripped from both
        ends of each piece, and pieces that end up empty are skipped.
        """
        #TODO: Consider replacing punctuation with space before split()
        for w in text.split():
            word = w.strip(cls.punctuation)
            if word:
                yield word
    word_iter = classmethod(word_iter)

    def words(cls, text):
        """Returns a list of words in text (see word_iter())."""
        return list(cls.word_iter(text))
    words = classmethod(words)

    def sentence_iter(cls, text, strip=True):
        """Returns an iterator over the sentences in text.

        @param text: the text to segment; a false value is treated as "".
        @param strip: whether surrounding whitespace should be stripped from
        each sentence.
        """
        text = text or ""
        lastmatch = 0
        for match in cls.sentencere.finditer(text):
            lastmatch = match.end()
            sentence = match.group()
            if strip:
                sentence = sentence.strip()
            if sentence:
                yield sentence
        # Whatever follows the last match is the final sentence; it did not
        # match because no whitespace follows its closing punctuation.
        remainder = text[lastmatch:]
        if strip:
            remainder = remainder.strip()
        if remainder:
            yield remainder
    sentence_iter = classmethod(sentence_iter)

    def sentences(cls, text, strip=True):
        """Returns a list of sentences in text (see sentence_iter())."""
        return list(cls.sentence_iter(text, strip=strip))
    sentences = classmethod(sentences)

    def capsstart(cls, text):
        """Determines whether the text starts with a capital letter.

        Leading whitespace and then leading punctuation are ignored.

        @return: True or False; False if nothing is left after stripping.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        # bool() so that an empty remainder gives False and not the empty
        # string itself.
        return bool(stripped) and stripped[0].isupper()
    capsstart = classmethod(capsstart)