2 # -*- coding: utf-8 -*-
4 # Copyright 2007-2008 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """This module contains all the common features for languages.
25 language code (km, af)
26 language name (Khmer, Afrikaans)
28 Number of plurals (nplurals)
30 pofilter tests to ignore
38 Ideas for possible features:
40 Language-Team information
54 Accelerator characters
56 Direction (rtl or ltr)
59 from translate
.lang
import data
63 """This class is the common parent class for all language classes."""
66 """The ISO 639 language code, possibly with a country specifier or other
76 """The full (English) name of this language.
78 Dialect codes should have the form of
85 """The number of plural forms of this language.
87 0 is not a valid value - it must be overridden.
88 Any positive integer is valid (it should probably be between 1 and 6)
93 """The plural equation for selection of plural forms.
95 This is used for PO files to fill into the header.
96 See U{http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html}.
100 listseperator
= u
", "
101 """This string is used to separate lists of textual elements. Most
102 languages probably can stick with the default comma, but Arabic and some
103 Asian languages might want to override this."""
105 commonpunc
= u
".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
106 """These punctuation marks are common in English and most languages that
109 quotes
= u
"‘’‛“”„‟′″‴‵‶‷‹›«»"
110 """These are different quotation marks used by various languages."""
113 """Inverted punctuation sometimes used at the beginning of sentences in
114 Spanish, Asturian, Galician, and Catalan."""
117 """These punctuation marks are used by Arabic and Persian, for example."""
119 CJKpunc
= u
"。、,;!?「」『』【】"
120 """These punctuation marks are used in certain circumstances with CJK
124 """These punctuation marks are used by several Indic languages."""
126 ethiopicpunc
= u
"።፤፣"
127 """These punctuation marks are used by several Ethiopic languages."""
129 miscpunc
= u
"…±°¹²³·©®×£¥€"
130 """The middle dot (·) is used by Greek and Georgian."""
132 punctuation
= u
"".join([commonpunc
, quotes
, invertedpunc
, rtlpunc
, CJKpunc
,\
133 indicpunc
, ethiopicpunc
, miscpunc
])
134 """We include many types of punctuation here, simply since this is only
135 meant to determine if something is punctuation. Hopefully we catch some
136 languages which might not be represented with modules. Most languages won't
137 need to override this."""
139 sentenceend
= u
".!?…։؟।。!?።"
140 """These marks can indicate a sentence end. Once again we try to account
141 for many languages. Most languages won't need to override this."""
143 #The following tries to account for a lot of things. For the best idea of
144 #what works, see test_common.py. We try to ignore abbreviations, for
145 #example, by checking that the following sentence doesn't start with lower
147 sentencere
= re
.compile(r
"""(?s) #make . also match newlines
148 .*? #anything, but match non-greedy
149 [%s] #the puntuation for sentence ending
150 \s+ #the spacing after the puntuation
151 (?=[^a-z\d])#lookahead that next part starts with caps
152 """ % sentenceend
, re
.VERBOSE
)
155 """A dictionary of punctuation transformation rules that can be used by
159 """List of pofilter tests for this language that must be ignored."""
162 """A language specific checker (see filters.checks).
164 This doesn't need to be supplied, but will be used if it exists."""
169 """Characters that can be used as accelerators (access keys) i.e. Alt+X
170 where X is the accelerator. These can include combining diacritics as
171 long as they are accessible from the users keyboard in a single keystroke,
172 but normally they would be at least precomposed characters. All characters,
173 lower and upper, are included in the list."""
175 def __new__(cls
, code
):
176 """This returns the language class for the given code, following a
177 singleton like approach (only one object per language)."""
179 # First see if a language object for this code already exists
180 if code
in cls
._languages
:
181 return cls
._languages
[code
]
182 # No existing language. Let's build a new one and keep a copy
183 language
= cls
._languages
[code
] = object.__new
__(cls
)
187 langdata
= data
.languages
.get(code
, None)
189 language
.fullname
, language
.nplurals
, language
.pluralequation
= langdata
191 code
= data
.simplercode(code
)
193 # print >> sys.stderr, "Warning: No information found about language code %s" % code
198 """Give a simple string representation without address information to
199 be able to store it in text for comparison later."""
202 detail
= "(%s)" % self
.code
203 return "<class 'translate.lang.common.Common%s'>" % detail
205 def punctranslate(cls
, text
):
206 """Converts the punctuation in a string according to the rules of the
208 # TODO: look at po::escapeforpo() for performance idea
209 for source
, target
in cls
.puncdict
.iteritems():
210 text
= text
.replace(source
, target
)
211 # Let's account for cases where a punctuation symbol plus a space is
212 # replaced, but the space won't exist at the end of a message
213 if text
and text
[-1] + " " in cls
.puncdict
:
214 text
= text
[:-1] + cls
.puncdict
[text
[-1] + " "]
216 punctranslate
= classmethod(punctranslate
)
218 def character_iter(cls
, text
):
219 """Returns an iterator over the characters in text."""
220 #We don't return more than one consecutive whitespace character
223 if c
.isspace() and prev
.isspace():
226 if not (c
in cls
.punctuation
):
228 character_iter
= classmethod(character_iter
)
def characters(cls, text):
    """Returns a list of characters in text.

    The list is the materialised output of C{cls.character_iter(text)}, so
    whatever that iterator yields (and in that order) is what ends up here.

    @param text: the text to split into characters
    @return: a list holding every item yielded by C{character_iter}
    """
    # list() drains the iterator directly; an identity comprehension would
    # only add a per-item loop for no benefit.
    return list(cls.character_iter(text))
characters = classmethod(characters)
235 def word_iter(cls
, text
):
236 """Returns an iterator over the words in text."""
237 #TODO: Consider replacing punctuation with space before split()
238 for w
in text
.split():
239 word
= w
.strip(cls
.punctuation
)
242 word_iter
= classmethod(word_iter
)
def words(cls, text):
    """Returns a list of words in text.

    The list is the materialised output of C{cls.word_iter(text)}, so
    whatever that iterator yields (and in that order) is what ends up here.

    @param text: the text to split into words
    @return: a list holding every item yielded by C{word_iter}
    """
    # list() drains the iterator directly; an identity comprehension would
    # only add a per-item loop for no benefit.
    return list(cls.word_iter(text))
words = classmethod(words)
249 def sentence_iter(cls
, text
, strip
=True):
250 """Returns an iterator over the sentences in text."""
253 iter = cls
.sentencere
.finditer(text
)
255 lastmatch
= item
.end()
256 sentence
= item
.group()
257 if strip
: sentence
= sentence
.strip()
258 if sentence
: yield sentence
259 remainder
= text
[lastmatch
:]
260 if strip
: remainder
= remainder
.strip()
261 if remainder
: yield remainder
262 sentence_iter
= classmethod(sentence_iter
)
def sentences(cls, text, strip=True):
    """Returns a list of sentences in text.

    The list is the materialised output of
    C{cls.sentence_iter(text, strip=strip)}.

    @param text: the text to split into sentences
    @param strip: passed through unchanged to C{sentence_iter} as the
    C{strip} keyword argument
    @return: a list holding every item yielded by C{sentence_iter}
    """
    # list() drains the iterator directly; an identity comprehension would
    # only add a per-item loop for no benefit.
    return list(cls.sentence_iter(text, strip=strip))
sentences = classmethod(sentences)
def capsstart(cls, text):
    """Determines whether the text starts with a capital letter.

    Leading whitespace is removed first, then any leading characters found
    in C{cls.punctuation}; the first character that remains is tested.

    @param text: the text to inspect
    @return: True if the first remaining character is uppercase, otherwise
    False (including when nothing remains after stripping)
    """
    stripped = text.lstrip().lstrip(cls.punctuation)
    # bool() keeps the result a real boolean: a bare "stripped and ..." would
    # hand back the empty string itself when nothing is left.
    return bool(stripped) and stripped[0].isupper()
capsstart = classmethod(capsstart)