lang/data.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2007 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """This module stores information and functionality that relates to plurals."""
  23
  24 import unicodedata
  25
  26 # The key is the language code, which may contain country codes and modifiers.
  27 # The value is a tuple: (Full name in English, nplurals, plural equation)
  28
  29 languages = {
  30 'af': ('Afrikaans', 2, '(n != 1)'),
  31 'ak': ('Akan', 2, 'n > 1'),
  32 'ar': ('Arabic', 6, 'n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n>=3 && n<=10 ? 3 : n>=11 && n<=99 ? 4 : 5'),
  33 'az': ('Azerbaijani', 2, '(n != 1)'),
  34 'be': ('Belarusian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'),
  35 'bg': ('Bulgarian', 2, '(n != 1)'),
  36 'bn': ('Bengali', 2, '(n != 1)'),
  37 'bo': ('Tibetan', 1, '0'),
  38 'bs': ('Bosnian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'),
  39 'ca': ('Catalan', 2, '(n != 1)'),
  40 'cs': ('Czech', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'),
  41 'cy': ('Welsh', 2, '(n==2) ? 1 : 0'),
  42 'da': ('Danish', 2, '(n != 1)'),
  43 'de': ('German', 2, '(n != 1)'),
  44 'dz': ('Dzongkha', 1, '0'),
  45 'el': ('Greek', 2, '(n != 1)'),
  46 'en': ('English', 2, '(n != 1)'),
  47 'en_UK': ('English (United Kingdom)', 2, '(n != 1)'),
  48 'en_ZA': ('English (South Africa)', 2, '(n != 1)'),
  49 'eo': ('Esperanto', 2, '(n != 1)'),
  50 'es': ('Spanish', 2, '(n != 1)'),
  51 'et': ('Estonian', 2, '(n != 1)'),
  52 'eu': ('Basque', 2, '(n != 1)'),
  53 'fa': ('Persian', 1, '0'),
  54 'fi': ('Finnish', 2, '(n != 1)'),
  55 'fo': ('Faroese', 2, '(n != 1)'),
  56 'fr': ('French', 2, '(n > 1)'),
  57 'fur': ('Friulian', 2, '(n != 1)'),
  58 'fy': ('Frisian', 2, '(n != 1)'),
  59 'ga': ('Irish', 3, 'n==1 ? 0 : n==2 ? 1 : 2'),
  60 'gl': ('Galician', 2, '(n != 1)'),
  61 'gu': ('Gujarati', 2, '(n != 1)'),
  62 'he': ('Hebrew', 2, '(n != 1)'),
  63 'hi': ('Hindi', 2, '(n != 1)'),
  64 'hr': ('Croatian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'),
  65 'hu': ('Hungarian', 2, '(n != 1)'),
  66 'id': ('Indonesian', 1, '0'),
  67 'is': ('Icelandic', 2, '(n != 1)'),
  68 'it': ('Italian', 2, '(n != 1)'),
  69 'ja': ('Japanese', 1, '0'),
  70 'ka': ('Georgian', 1, '0'),
  71 'km': ('Khmer', 1, '0'),
  72 'ko': ('Korean', 1, '0'),
  73 'ku': ('Kurdish', 2, '(n != 1)'),
  74 'lb': ('Letzeburgesch', 2, '(n != 1)'),
  75 'ln': ('Lingala', 2, '(n > 1)'),
  76 'lt': ('Lithuanian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && (n%100<10 || n%100>=20) ? 1 : 2)'),
  77 'lv': ('Latvian', 3, '(n%10==1 && n%100!=11 ? 0 : n != 0 ? 1 : 2)'),
  78 'mg': ('Malagasy', 2, '(n > 1)'),
  79 'mn': ('Mongolian', 2, '(n != 1)'),
  80 'mr': ('Marathi', 2, '(n != 1)'),
  81 'ms': ('Malay', 1, '0'),
  82 'mt': ('Maltese', 4, '(n==1 ? 0 : n==0 || ( n%100>1 && n%100<11) ? 1 : (n%100>10 && n%100<20 ) ? 2 : 3)'),
  83 'nah': ('Nahuatl', 2, '(n != 1)'),
  84 'nb': ('Norwegian Bokmal', 2, '(n != 1)'),
  85 'ne': ('Nepali', 2, '(n != 1)'),
  86 'nl': ('Dutch', 2, '(n != 1)'),
  87 'nn': ('Norwegian Nynorsk', 2, '(n != 1)'),
  88 'nso': ('Northern Sotho', 2, '(n > 1)'),
  89 'or': ('Oriya', 2, '(n != 1)'),
  90 'pa': ('Punjabi', 2, '(n != 1)'),
  91 'pap': ('Papiamento', 2, '(n != 1)'),
  92 'pl': ('Polish', 3, '(n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'),
  93 'pt': ('Portugese', 2, '(n != 1)'),
  94 'pt_BR': ('Portugese (Brazil)', 2, '(n > 1)'),
  95 'ro': ('Romanian', 3, '(n==1 ? 0 : (n==0 || (n%100 > 0 && n%100 < 20)) ? 1 : 2);'),
  96 'ru': ('Russian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'),
  97 'sk': ('Slovak', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'),
  98 'sl': ('Slovenian', 4, '(n%100==1 ? 0 : n%100==2 ? 1 : n%100==3 || n%100==4 ? 2 : 3)'),
  99 'sq': ('Albanian', 2, '(n != 1)'),
 100 'sr': ('Serbian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'),
 101 'sv': ('Swedish', 2, '(n != 1)'),
 102 'ta': ('Tamil', 2, '(n != 1)'),
 103 'th': ('Thai', 1, '0'),
 104 'tk': ('Turkmen', 2, '(n != 1)'),
 105 'tr': ('Turkish', 1, '0'),
 106 'uk': ('Ukrainian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'),
 107 'vi': ('Vietnamese',1 , '0'),
 108 'wa': ('Walloon', 2, '(n > 1)'),
 109 # Chinese is difficult because the main divide is on script, not really
 110 # country. Simplified Chinese is used mostly in China, Singapore and Malaysia.
 111 # Traditional Chinese is used mostly in Hong Kong, Taiwan and Macau.
 112 'zh_CN': ('Chinese (China)', 1, '0'),
 113 'zh_HK': ('Chinese (Hong Kong)', 1, '0'),
 114 'zh_TW': ('Chinese (Taiwan)', 1, '0'),
 115 }
 116
 117 def simplercode(code):
 118     """This attempts to simplify the given language code by ignoring country
 119     codes, for example."""
 120     # Check http://www.rfc-editor.org/rfc/bcp/bcp47.txt for possible extra issues
 121     # http://www.rfc-editor.org/rfc/rfc4646.txt
 122     # http://www.w3.org/International/articles/language-tags/
 123     if not code:
 124         return code
 125
 126     # The @ modifier is used for script variants of the same language, like
 127     # sr@Latn or gez_ER@abegede
 128     modifier = code.rfind("@")
 129     if modifier >= 0:
 130         return code[:modifier]
 131
 132     underscore = code.rfind("_")
 133     if underscore >= 0:
 134         return code[:underscore]
 135
 136
 137 import gettext
 138 import re
 139
 140 iso639 = {}
 141 iso3166 = {}
 142
 143 dialectre = re.compile(r"([^(\s]+)\s*\(([^)]+)\)")
 144
 145 def tr_lang(langcode):
 146     """Gives a function that can translate a language name, even in the form
 147         "language (country)"
 148     into the language with iso code langcode."""
 149     langfunc = gettext_lang(langcode)
 150     countryfunc = gettext_country(langcode)
 151
 152     def handlelanguage(name):
 153         match = dialectre.match(name)
 154         if match:
 155             language, country = match.groups()
 156             return u"%s (%s)" % (langfunc(language), countryfunc(country))
 157         else:
 158             return langfunc(name)
 159
 160     return handlelanguage
 161
 162 def gettext_lang(langcode):
 163     """Returns a gettext function to translate language names into the given
 164     language."""
 165     if not langcode in iso639:
 166         t = gettext.translation('iso_639', languages=[langcode], fallback=True)
 167         iso639[langcode] = t.ugettext
 168     return iso639[langcode]
 169
 170 def gettext_country(langcode):
 171     """Returns a gettext function to translate country names into the given
 172     language."""
 173     if not langcode in iso3166:
 174         t = gettext.translation('iso_3166', languages=[langcode], fallback=True)
 175         iso3166[langcode] = t.ugettext
 176     return iso3166[langcode]
 177
 178 def normalize(string, normal_form="NFC"):
 179     """Return a unicode string in its normalized form
 180
 181     @param sting: The string to be normalized
 182     @param normal_form: NFC (default), NFD, NFCK, NFDK
 183     @return: Normalized string
 184     """
 185     return unicodedata.normalize(normal_form, string)
 186
 187 def forceunicode(string):
 188     """Helper method to ensure that the parameter becomes unicode if not yet"""
 189     if string is None:
 190         return None
 191     if isinstance(string, str):
 192         encoding = getattr(string, "encoding", "utf-8")
 193         string = string.decode(encoding)
 194     string = normalize(string)
 195     return string
 196