storage/pypo.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2002-2007 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """classes that hold units of .po files (pounit) or entire files (pofile)
  23 gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)"""
  24
  25 from __future__ import generators
  26 from translate.misc.multistring import multistring
  27 from translate.misc import quote
  28 from translate.misc import textwrap
  29 from translate.lang import data
  30 from translate.storage import pocommon, base
  31 import re
  32
  33 lsep = "\n#: "
"""Separator for #: entries"""
  35
  36 # general functions for quoting / unquoting po strings
  37
  38 po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'}
  39 po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()])
  40
  41 def escapeforpo(line):
  42     """Escapes a line for po format. assumes no \n occurs in the line.
  43
  44     @param line: unescaped text
  45     """
  46     special_locations = []
  47     for special_key in po_escape_map:
  48         special_locations.extend(quote.find_all(line, special_key))
  49     special_locations = dict.fromkeys(special_locations).keys()
  50     special_locations.sort()
  51     escaped_line = ""
  52     last_location = 0
  53     for location in special_locations:
  54         escaped_line += line[last_location:location]
  55         escaped_line += po_escape_map[line[location:location+1]]
  56         last_location = location+1
  57     escaped_line += line[last_location:]
  58     return escaped_line
  59
  60 def unescapehandler(escape):
  61
  62     return po_unescape_map.get(escape, escape)
  63
  64 def wrapline(line):
  65     """Wrap text for po files."""
  66     wrappedlines = textwrap.wrap(line, 76, replace_whitespace=False, expand_tabs=False, drop_whitespace=False)
  67
  68     # Lines should not start with a space...
  69     if len(wrappedlines) > 1:
  70         for index, line in enumerate(wrappedlines[1:]):
  71             if line.startswith(' '):
  72                 # Remove the space at the beginning of the line:
  73                 wrappedlines[index+1] = line[1:]
  74
  75                 # Append a space to the previous line:
  76                 wrappedlines[index] += ' '
  77     return wrappedlines
  78
  79 def quoteforpo(text):
  80     """quotes the given text for a PO file, returning quoted and escaped lines"""
  81     polines = []
  82     if text is None:
  83         return polines
  84     lines = text.split("\n")
  85     if len(lines) > 1 or (len(lines) == 1 and len(lines[0]) > 71):
  86         if len(lines) != 2 or lines[1]:
  87             polines.extend(['""'])
  88         for line in lines[:-1]:
  89             lns = wrapline(line)
  90             if len(lns) > 0:
  91                 for ln in lns[:-1]:
  92                     polines.extend(['"' + escapeforpo(ln) + '"'])
  93                 if lns[-1]:
  94                     polines.extend(['"' + escapeforpo(lns[-1]) + '\\n"'])
  95             else:
  96                 polines.extend(['"\\n"'])
  97     if lines[-1]:
  98         polines.extend(['"' + escapeforpo(line) + '"' for line in wrapline(lines[-1])])
  99     return polines
 100
 101 def extractpoline(line):
 102     """Remove quote and unescape line from po file.
 103
 104     @param line: a quoted line from a po file (msgid or msgstr)
 105     """
 106     extracted = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)[0]
 107     return extracted
 108
 109 def unquotefrompo(postr):
 110     return u"".join([extractpoline(line) for line in postr])
 111
 112 def encodingToUse(encoding):
 113     """Tests whether the given encoding is known in the python runtime, or returns utf-8.
 114     This function is used to ensure that a valid encoding is always used."""
 115     if encoding == "CHARSET" or encoding == None: return 'utf-8'
 116     return encoding
 117 #    if encoding is None: return False
 118 #    return True
 119 #    try:
 120 #        tuple = codecs.lookup(encoding)
 121 #    except LookupError:
 122 #        return False
 123 #    return True
 124
 125 """
 126 From the GNU gettext manual:
 127      WHITE-SPACE
 128      #  TRANSLATOR-COMMENTS
 129      #. AUTOMATIC-COMMENTS
 130      #| PREVIOUS MSGID                 (Gettext 0.16 - check if this is the correct position - not yet implemented)
 131      #: REFERENCE...
 132      #, FLAG...
 133      msgctxt CONTEXT                   (Gettext 0.15)
 134      msgid UNTRANSLATED-STRING
 135      msgstr TRANSLATED-STRING
 136 """
 137
 138 class pounit(pocommon.pounit):
 139     # othercomments = []      #   # this is another comment
 140     # automaticcomments = []  #   #. comment extracted from the source code
 141     # sourcecomments = []     #   #: sourcefile.xxx:35
 142     # typecomments = []       #   #, fuzzy
 143     # msgidcomments = []      #   _: within msgid
 144     # msgctxt
 145     # msgid = []
 146     # msgstr = []
 147
 148     def __init__(self, source=None, encoding="UTF-8"):
 149         self._encoding = encodingToUse(encoding)
 150         self.obsolete = False
 151         self._initallcomments(blankall=True)
 152         self.msgctxt = []
 153         self.msgid = []
 154         self.msgid_pluralcomments = []
 155         self.msgid_plural = []
 156         self.msgstr = []
 157         self.obsoletemsgctxt = []
 158         self.obsoletemsgid = []
 159         self.obsoletemsgid_pluralcomments = []
 160         self.obsoletemsgid_plural = []
 161         self.obsoletemsgstr = []
 162         if source:
 163             self.setsource(source)
 164         super(pounit, self).__init__(source)
 165
 166     def _initallcomments(self, blankall=False):
 167         """Initialises allcomments"""
 168         if blankall:
 169             self.othercomments = []
 170             self.automaticcomments = []
 171             self.sourcecomments = []
 172             self.typecomments = []
 173             self.msgidcomments = []
 174             self.obsoletemsgidcomments = []
 175         self.allcomments = [self.othercomments,
 176                             self.automaticcomments,
 177                             self.sourcecomments,
 178                             self.typecomments,
 179                             self.msgidcomments,
 180                             self.obsoletemsgidcomments]
 181
 182     def getsource(self):
 183         """Returns the unescaped msgid"""
 184         multi = multistring(unquotefrompo(self.msgid), self._encoding)
 185         if self.hasplural():
 186             pluralform = unquotefrompo(self.msgid_plural)
 187             if isinstance(pluralform, str):
 188                 pluralform = pluralform.decode(self._encoding)
 189             multi.strings.append(pluralform)
 190         return multi
 191
 192     def setsource(self, source):
 193         """Sets the msgid to the given (unescaped) value.
 194
 195         @param source: an unescaped source string.
 196         """
 197         if isinstance(source, str):
 198             source = source.decode(self._encoding)
 199         if isinstance(source, multistring):
 200             source = source.strings
 201         if isinstance(source, list):
 202             self.msgid = quoteforpo(source[0])
 203             if len(source) > 1:
 204                 self.msgid_plural = quoteforpo(source[1])
 205         else:
 206             self.msgid = quoteforpo(source)
 207     source = property(getsource, setsource)
 208
 209     def gettarget(self):
 210         """Returns the unescaped msgstr"""
 211         if isinstance(self.msgstr, dict):
 212             multi = multistring(map(unquotefrompo, self.msgstr.values()), self._encoding)
 213         else:
 214             multi = multistring(unquotefrompo(self.msgstr), self._encoding)
 215         return multi
 216
 217     def settarget(self, target):
 218         """Sets the msgstr to the given (unescaped) value"""
 219         if isinstance(target, str):
 220             target = target.decode(self._encoding)
 221         if target == self.target:
 222             return
 223         if self.hasplural():
 224             if isinstance(target, multistring):
 225                 target = target.strings
 226             elif isinstance(target, basestring):
 227                 target = [target]
 228         elif isinstance(target,(dict, list)):
 229             if len(target) == 1:
 230                 target = target[0]
 231             else:
 232                 raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target))
 233         templates = self.msgstr
 234         if isinstance(templates, list):
 235             templates = {0: templates}
 236         if isinstance(target, list):
 237             self.msgstr = dict([(i, quoteforpo(target[i])) for i in range(len(target))])
 238         elif isinstance(target, dict):
 239             self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.iteritems()])
 240         else:
 241             self.msgstr = quoteforpo(target)
 242     target = property(gettarget, settarget)
 243
 244     def getnotes(self, origin=None):
 245         """Return comments based on origin value (programmer, developer, source code and translator)"""
 246         if origin == None:
 247             comments = u"".join([comment[2:] for comment in self.othercomments])
 248             comments += u"".join([comment[3:] for comment in self.automaticcomments])
 249         elif origin == "translator":
 250             comments = u"".join ([comment[2:] for comment in self.othercomments])
 251         elif origin in ["programmer", "developer", "source code"]:
 252             comments = u"".join([comment[3:] for comment in self.automaticcomments])
 253         else:
 254             raise ValueError("Comment type not valid")
 255         # Let's drop the last newline
 256         return comments[:-1]
 257
 258     def addnote(self, text, origin=None, position="append"):
 259         """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote"""
 260         # We don't want to put in an empty '#' without a real comment:
 261         if not text:
 262             return
 263         text = data.forceunicode(text)
 264         commentlist = self.othercomments
 265         linestart = "# "
 266         if origin in ["programmer", "developer", "source code"]:
 267             autocomments = True
 268             commentlist = self.automaticcomments
 269             linestart = "#. "
 270         text = text.split("\n")
 271         if position == "append":
 272             commentlist += [linestart + line + "\n" for line in text]
 273         else:
 274             newcomments = [linestart + line + "\n" for line in text]
 275             newcomments += [line for line in commentlist]
 276             if autocomments:
 277                 self.automaticcomments = newcomments
 278             else:
 279                 self.othercomments = newcomments
 280
 281     def removenotes(self):
 282         """Remove all the translator's notes (other comments)"""
 283         self.othercomments = []
 284
 285     def copy(self):
 286         newpo = self.__class__()
 287         newpo.othercomments = self.othercomments[:]
 288         newpo.automaticcomments = self.automaticcomments[:]
 289         newpo.sourcecomments = self.sourcecomments[:]
 290         newpo.typecomments = self.typecomments[:]
 291         newpo.obsolete = self.obsolete
 292         newpo.msgidcomments = self.msgidcomments[:]
 293         newpo._initallcomments()
 294         newpo.msgctxt = self.msgctxt[:]
 295         newpo.msgid = self.msgid[:]
 296         newpo.msgid_pluralcomments = self.msgid_pluralcomments[:]
 297         newpo.msgid_plural = self.msgid_plural[:]
 298         if isinstance(self.msgstr, dict):
 299             newpo.msgstr = self.msgstr.copy()
 300         else:
 301             newpo.msgstr = self.msgstr[:]
 302
 303         newpo.obsoletemsgctxt = self.obsoletemsgctxt[:]
 304         newpo.obsoletemsgid = self.obsoletemsgid[:]
 305         newpo.obsoletemsgid_pluralcomments = self.obsoletemsgid_pluralcomments[:]
 306         newpo.obsoletemsgid_plural = self.obsoletemsgid_plural[:]
 307         if isinstance(self.obsoletemsgstr, dict):
 308             newpo.obsoletemsgstr = self.obsoletemsgstr.copy()
 309         else:
 310             newpo.obsoletemsgstr = self.obsoletemsgstr[:]
 311         return newpo
 312
 313     def msgidlen(self):
 314         if self.hasplural():
 315             return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip())
 316         else:
 317             return len(unquotefrompo(self.msgid).strip())
 318
 319     def msgstrlen(self):
 320         if isinstance(self.msgstr, dict):
 321             combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.itervalues()])
 322             return len(combinedstr.strip())
 323         else:
 324             return len(unquotefrompo(self.msgstr).strip())
 325
    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
        """Merges the otherpo (with the same msgid) into this one.

        Overwrite non-blank self.msgstr only if overwrite is True
        merge comments only if comments is True

        The fuzzy flag is updated as follows: when otherpo's target is taken,
        the unit is marked fuzzy if the sources differ, otherwise it gets
        otherpo's fuzzy state; when otherpo is untranslated, the unit is
        marked fuzzy if the sources differ; in the remaining case it is marked
        fuzzy if the targets differ.

        Note that otherpo itself may be modified: its comment lists can be
        decoded in place and its target may have a KDE style comment removed.

        @param otherpo: the unit to merge from; if it is not a pounit the
        merge is handed to the parent class
        @param overwrite: take otherpo's target even if this unit is translated
        @param comments: also merge the comment lists
        @param authoritative: if True, otherpo's automatic, msgid and source
        comments are not merged into this unit
        """

        def mergelists(list1, list2, split=False):
            """Add the items of list2 that list1 lacks to list1 (in place).

            With split=True each item is broken up on whitespace, the first
            word being taken as the prefix and the remaining words as the
            entries to merge one by one.
            """
            #decode where necessary
            # (if either list holds a unicode item, byte strings in both lists
            # are decoded as utf-8, in place)
            if unicode in [type(item) for item in list2] + [type(item) for item in list1]:
                for position, item in enumerate(list1):
                    if isinstance(item, str):
                        list1[position] = item.decode("utf-8")
                for position, item in enumerate(list2):
                    if isinstance(item, str):
                        list2[position] = item.decode("utf-8")

            #Determine the newline style of list1
            # The last matching candidate wins.
            # NOTE(review): "\n\r" looks like it was meant to be "\r\n" - confirm
            lineend = ""
            if list1 and list1[0]:
                for candidate in ["\n", "\r", "\n\r"]:
                    if list1[0].endswith(candidate):
                        lineend = candidate
                if not lineend:
                    lineend = ""
            else:
                lineend = "\n"

            #Split if directed to do so:
            if split:
                splitlist1 = []
                splitlist2 = []
                prefix = "#"
                for item in list1:
                    splitlist1.extend(item.split()[1:])
                    prefix = item.split()[0]
                for item in list2:
                    splitlist2.extend(item.split()[1:])
                    prefix = item.split()[0]
                # each entry that is new to list1 becomes a line of its own
                list1.extend(["%s %s%s" % (prefix, item, lineend) for item in splitlist2 if not item in splitlist1])
            else:
                #Normal merge, but conform to list1 newline style
                if list1 != list2:
                    for item in list2:
                        if lineend:
                            item = item.rstrip() + lineend
                        # avoid duplicate comment lines (this might cause some problems)
                        # (items shorter than 5 characters are always added)
                        if item not in list1 or len(item) < 5:
                            list1.append(item)
        if not isinstance(otherpo, pounit):
            super(pounit, self).merge(otherpo, overwrite, comments)
            return
        if comments:
            mergelists(self.othercomments, otherpo.othercomments)
            mergelists(self.typecomments, otherpo.typecomments)
            if not authoritative:
                # When authoritative we consider ourself to be the authority
                # and do not bring across otherpo.automaticcomments,
                # otherpo.msgidcomments or otherpo.sourcecomments; they are
                # only merged here, in the non-authoritative case.
                mergelists(self.automaticcomments, otherpo.automaticcomments)
                mergelists(self.msgidcomments, otherpo.msgidcomments)
                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
        if not self.istranslated() or overwrite:
            # Remove kde-style comments from the translation (if any).
            if self._extract_msgidcomments(otherpo.target):
                otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments()+ '\n', '')
            self.target = otherpo.target
            if self.source != otherpo.source:
                self.markfuzzy()
            else:
                self.markfuzzy(otherpo.isfuzzy())
        elif not otherpo.istranslated():
            if self.source != otherpo.source:
                self.markfuzzy()
        else:
            if self.target != otherpo.target:
                self.markfuzzy()
 403
 404     def isheader(self):
 405         #return (self.msgidlen() == 0) and (self.msgstrlen() > 0) and (len(self.msgidcomments) == 0)
 406         #rewritten here for performance:
 407         return ((self.msgid == [] or self.msgid == ['""']) and
 408                         not (self.msgstr == [] or self.msgstr == ['""'])
 409                         and self.msgidcomments == []
 410                         and (self.msgctxt == [] or self.msgctxt == ['""'])
 411                         and (self.sourcecomments == [] or self.sourcecomments == [""]))
 412
 413     def isblank(self):
 414         if self.isheader() or len(self.msgidcomments):
 415             return False
 416         if (self.msgidlen() == 0) and (self.msgstrlen() == 0):
 417             return True
 418         return False
 419         # TODO: remove:
 420         # Before, the equivalent of the following was the final return statement:
 421         # return len(self.source.strip()) == 0
 422
 423     def hastypecomment(self, typecomment):
 424         """check whether the given type comment is present"""
 425         # check for word boundaries properly by using a regular expression...
 426         return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0
 427
 428     def hasmarkedcomment(self, commentmarker):
 429         """check whether the given comment marker is present as # (commentmarker) ..."""
 430         commentmarker = "(%s)" % commentmarker
 431         for comment in self.othercomments:
 432             if comment.replace("#", "", 1).strip().startswith(commentmarker):
 433                 return True
 434         return False
 435
 436     def settypecomment(self, typecomment, present=True):
 437         """alters whether a given typecomment is present"""
 438         if self.hastypecomment(typecomment) != present:
 439             if present:
 440                 self.typecomments.append("#, %s\n" % typecomment)
 441             else:
 442                 # this should handle word boundaries properly ...
 443                 typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments)
 444                 self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments)
 445
 446     def istranslated(self):
 447         return super(pounit, self).istranslated() and not self.isobsolete()
 448
 449     def istranslatable(self):
 450         return not (self.isheader() or self.isblank())
 451
 452     def isfuzzy(self):
 453         return self.hastypecomment("fuzzy")
 454
 455     def markfuzzy(self, present=True):
 456         self.settypecomment("fuzzy", present)
 457
 458     def isreview(self):
 459         return self.hastypecomment("review") or self.hasmarkedcomment("review") or self.hasmarkedcomment("pofilter")
 460
 461     def isobsolete(self):
 462         return self.obsolete
 463
 464     def makeobsolete(self):
 465         """Makes this unit obsolete"""
 466         self.obsolete = True
 467         if self.msgctxt:
 468             self.obsoletemsgctxt = self.msgctxt
 469         if self.msgid:
 470             self.obsoletemsgid = self.msgid
 471             self.msgid = []
 472         if self.msgidcomments:
 473             self.obsoletemsgidcomments = self.msgidcomments
 474             self.msgidcomments = []
 475         if self.msgid_plural:
 476             self.obsoletemsgid_plural = self.msgid_plural
 477             self.msgid_plural = []
 478         if self.msgstr:
 479             self.obsoletemsgstr = self.msgstr
 480             self.msgstr = []
 481         self.sourcecomments = []
 482         self.automaticcomments = []
 483
 484     def resurrect(self):
 485         """Makes an obsolete unit normal"""
 486         self.obsolete = False
 487         if self.obsoletemsgctxt:
 488             self.msgid = self.obsoletemsgctxt
 489             self.obsoletemsgctxt = []
 490         if self.obsoletemsgid:
 491             self.msgid = self.obsoletemsgid
 492             self.obsoletemsgid = []
 493         if self.obsoletemsgidcomments:
 494             self.msgidcomments = self.obsoletemsgidcomments
 495             self.obsoletemsgidcomments = []
 496         if self.obsoletemsgid_plural:
 497             self.msgid_plural = self.obsoletemsgid_plural
 498             self.obsoletemsgid_plural = []
 499         if self.obsoletemsgstr:
 500             self.msgstr = self.obsoletemsgstr
 501             self.obsoletemgstr = []
 502
 503     def hasplural(self):
 504         """returns whether this pounit contains plural strings..."""
 505         return len(self.msgid_plural) > 0
 506
    def parselines(self, lines):
        """Parse the given lines into this unit.

        Comment lines are sorted into the comment lists by their second
        character, and quoted strings are appended to the message part
        (msgctxt, msgid, msgid_plural or msgstr) that is currently being read.
        Parsing stops at a line that belongs to a following unit.

        @param lines: the lines to parse, without their line endings (a
        newline is appended to each line before it is examined)
        @return: the number of lines looked at, which includes a line on
        which parsing stopped
        """
        # Flags recording which message part the current line belongs to.
        inmsgctxt = 0
        inmsgid = 0
        inmsgid_comment = 0
        inmsgid_plural = 0
        inmsgstr = 0
        # Index of the plural form being read (msgstr[N]), None for a plain msgstr.
        msgstr_pluralid = None
        linesprocessed = 0
        for line in lines:
            line = line + "\n"
            linesprocessed += 1
            # NOTE(review): a newline was just appended, so this length test
            # does not look like it can ever be true - confirm
            if len(line) == 0:
                continue
            elif line[0] == '#':
                if inmsgstr and not line[1] == '~':
                    # if we're already in the message string, this is from the next element
                    break
                if line[1] == '.':
                    self.automaticcomments.append(line)
                elif line[1] == ':':
                    self.sourcecomments.append(line)
                elif line[1] == ',':
                    self.typecomments.append(line)
                elif line[1] == '~':
                    # obsolete entry: drop the first three characters and
                    # handle the rest of the line like a normal line below
                    line = line[3:]
                    self.obsolete = True
                else:
                    self.othercomments.append(line)
            # 'msgid_plural' has to be tested before 'msgid', which is a prefix of it
            if line.startswith('msgid_plural'):
                inmsgctxt = 0
                inmsgid = 0
                inmsgid_plural = 1
                inmsgstr = 0
                inmsgid_comment = 0
            elif line.startswith('msgctxt'):
                inmsgctxt = 1
                inmsgid = 0
                inmsgid_plural = 0
                inmsgstr = 0
                inmsgid_comment = 0
            elif line.startswith('msgid'):
                # if we just finished a msgstr or msgid_plural, there is probably an
                # empty line missing between the units, so let's stop the parsing now.
                if inmsgstr or inmsgid_plural:
                    break
                inmsgctxt = 0
                inmsgid = 1
                inmsgid_plural = 0
                inmsgstr = 0
                inmsgid_comment = 0
            elif line.startswith('msgstr'):
                inmsgctxt = 0
                inmsgid = 0
                inmsgid_plural = 0
                inmsgstr = 1
                if line.startswith('msgstr['):
                    # the plural index is the number between the brackets
                    msgstr_pluralid = int(line[len('msgstr['):line.find(']')].strip())
                else:
                    msgstr_pluralid = None
            extracted = quote.extractstr(line)
            if not extracted is None:
                if inmsgctxt:
                    self.msgctxt.append(extracted)
                elif inmsgid:
                    # TODO: improve kde comment detection
                    # A string containing "_:" starts a msgid comment, which
                    # lasts up to and including a string containing an escaped newline.
                    if extracted.find("_:") != -1:
                        inmsgid_comment = 1
                    if inmsgid_comment:
                        self.msgidcomments.append(extracted)
                    else:
                        self.msgid.append(extracted)
                    if inmsgid_comment and extracted.find("\\n") != -1:
                        inmsgid_comment = 0
                elif inmsgid_plural:
                    # same comment detection as for msgid
                    if extracted.find("_:") != -1:
                        inmsgid_comment = 1
                    if inmsgid_comment:
                        self.msgid_pluralcomments.append(extracted)
                    else:
                        self.msgid_plural.append(extracted)
                    if inmsgid_comment and extracted.find("\\n") != -1:
                        inmsgid_comment = 0
                elif inmsgstr:
                    if msgstr_pluralid is None:
                        self.msgstr.append(extracted)
                    else:
                        # plural forms are kept in a dict keyed by plural
                        # index; lines collected so far become entry 0
                        if type(self.msgstr) == list:
                            self.msgstr = {0: self.msgstr}
                        if msgstr_pluralid not in self.msgstr:
                            self.msgstr[msgstr_pluralid] = []
                        self.msgstr[msgstr_pluralid].append(extracted)
        if self.obsolete:
            # move the parsed parts over to the obsolete attributes
            self.makeobsolete()
        # If this unit is the header, we have to get the encoding to ensure that no
        # methods are called that need the encoding before we obtained it.
        if self.isheader():
            charset = re.search("charset=([^\\s]+)", unquotefrompo(self.msgstr))
            if charset:
                self._encoding = encodingToUse(charset.group(1))
        return linesprocessed
 607
 608     def parse(self, src):
 609         if isinstance(src, str):
 610             # This has not been decoded yet, so we need to make a plan
 611             src = src.decode(self._encoding)
 612         return self.parselines(src.split("\n"))
 613
    def _getmsgpartstr(self, partname, partlines, partcomments=""):
        """Build the output text of one message part (such as msgid or msgstr).

        @param partname: the keyword that starts the part, e.g. "msgid"
        @param partlines: the quoted lines of the part, or a dict of such
        lists keyed by plural index; each dict entry is written as a part
        named partname[index], in ascending index order
        @param partcomments: quoted KDE style comment lines, written before
        the part's own lines
        @return: the part as one string, every line ending in a newline
        """
        if isinstance(partlines, dict):
            # one part per plural form, sorted by plural index
            partkeys = partlines.keys()
            partkeys.sort()
            return "".join([self._getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys])
        partstr = partname + " "
        # index of the first line of partlines still to be written by the loop at the end
        partstartline = 0
        if len(partlines) > 0 and len(partcomments) == 0:
            # no comments: the first line goes right after the keyword
            partstr += partlines[0]
            partstartline = 1
        elif len(partcomments) > 0:
            if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0:
                # if there is a blank leader line, it must come before the comment
                partstr += partlines[0] + '\n'
                # but if the whole string is blank, leave it in
                if len(partlines) > 1:
                    partstartline += 1
            else:
                # All partcomments should start on a newline
                partstr += '""\n'
            # combine comments into one if more than one
            if len(partcomments) > 1:
                combinedcomment = []
                for comment in partcomments:
                    comment = unquotefrompo([comment])
                    if comment.startswith("_:"):
                        comment = comment[len("_:"):]
                    if comment.endswith("\\n"):
                        comment = comment[:-len("\\n")]
                    #Before we used to strip. Necessary in some cases?
                    combinedcomment.append(comment)
                partcomments = quoteforpo("_:%s" % "".join(combinedcomment))
            # comments first, no blank leader line needed
            partstr += "\n".join(partcomments)
            partstr = quote.rstripeol(partstr)
        else:
            # neither lines nor comments: write an empty string
            partstr += '""'
        partstr += '\n'
        # add the rest
        for partline in partlines[partstartline:]:
            partstr += partline + '\n'
        return partstr
 656
 657     def _encodeifneccessary(self, output):
 658         """encodes unicode strings and returns other strings unchanged"""
 659         if isinstance(output, unicode):
 660             encoding = encodingToUse(getattr(self, "encoding", "UTF-8"))
 661             return output.encode(encoding)
 662         return output
 663
 664     def __str__(self):
 665         """convert to a string. double check that unicode is handled somehow here"""
 666         output = self._getoutput()
 667         return self._encodeifneccessary(output)
 668
 669     def _getoutput(self):
 670         """return this po element as a string"""
 671         lines = []
 672         lines.extend(self.othercomments)
 673         if self.isobsolete():
 674             lines.extend(self.typecomments)
 675             obsoletelines = []
 676             if self.obsoletemsgctxt:
 677                 obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
 678             obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
 679             if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
 680                 obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
 681             obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
 682             for index, obsoleteline in enumerate(obsoletelines):
 683                 # We need to account for a multiline msgid or msgstr here
 684                 obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "')
 685             lines.extend(obsoletelines)
 686             lines = [self._encodeifneccessary(line) for line in lines]
 687             return "".join(lines)
 688         # if there's no msgid don't do msgid and string, unless we're the header
 689         # this will also discard any comments other than plain othercomments...
 690         if (len(self.msgid) == 0) or ((len(self.msgid) == 1) and (self.msgid[0] == '""')):
 691             if not (self.isheader() or self.msgidcomments or self.sourcecomments):
 692                 return "".join(lines)
 693         lines.extend(self.automaticcomments)
 694         lines.extend(self.sourcecomments)
 695         lines.extend(self.typecomments)
 696         if self.msgctxt:
 697             lines.append(self._getmsgpartstr("msgctxt", self.msgctxt))
 698         lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments))
 699         if self.msgid_plural or self.msgid_pluralcomments:
 700             lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
 701         lines.append(self._getmsgpartstr("msgstr", self.msgstr))
 702         lines = [self._encodeifneccessary(line) for line in lines]
 703         postr = "".join(lines)
 704         return postr
 705
 706     def getlocations(self):
 707         """Get a list of locations from sourcecomments in the PO unit
 708
 709         rtype: List
 710         return: A list of the locations with '#: ' stripped
 711
 712         """
 713         locations = []
 714         for sourcecomment in self.sourcecomments:
 715             locations += quote.rstripeol(sourcecomment)[3:].split()
 716         return locations
 717
 718     def addlocation(self, location):
 719         """Add a location to sourcecomments in the PO unit
 720
 721         @param location: Text location e.g. 'file.c:23' does not include #:
 722         @type location: String
 723
 724         """
 725         self.sourcecomments.append("#: %s\n" % location)
 726
 727     def _extract_msgidcomments(self, text=None):
 728         """Extract KDE style msgid comments from the unit.
 729
 730         @rtype: String
 731         @return: Returns the extracted msgidcomments found in this unit's msgid.
 732
 733         """
 734
 735         if not text:
 736             text = unquotefrompo(self.msgidcomments)
 737         return text.split('\n')[0].replace('_: ', '', 1)
 738
 739     def getcontext(self):
 740         """Get the message context."""
 741         return unquotefrompo(self.msgctxt) + self._extract_msgidcomments()
 742
 743     def getid(self):
 744         """Returns a unique identifier for this unit."""
 745         context = self.getcontext()
 746         # Gettext does not consider the plural to determine duplicates, only
 747         # the msgid. For generation of .mo files, we might want to use this
 748         # code to generate the entry for the hash table, but for now, it is
 749         # commented out for conformance to gettext.
 750 #        id = '\0'.join(self.source.strings)
 751         id = self.source
 752         if self.msgidcomments:
 753             id = "_: %s\n%s" % (context, id)
 754         elif context:
 755             id = "%s\04%s" % (context, id)
 756         return id
 757
 758 class pofile(pocommon.pofile):
 759     """this represents a .po file containing various units"""
 760     UnitClass = pounit
 761     def __init__(self, inputfile=None, encoding=None, unitclass=pounit):
 762         """construct a pofile, optionally reading in from inputfile.
 763         encoding can be specified but otherwise will be read from the PO header"""
 764         self.UnitClass = unitclass
 765         pocommon.pofile.__init__(self, unitclass=unitclass)
 766         self.units = []
 767         self.filename = ''
 768         self._encoding = encodingToUse(encoding)
 769         if inputfile is not None:
 770             self.parse(inputfile)
 771
 772     def changeencoding(self, newencoding):
 773         """changes the encoding on the file"""
 774         self._encoding = encodingToUse(newencoding)
 775         if not self.units:
 776             return
 777         header = self.header()
 778         if not header or header.isblank():
 779             return
 780         charsetline = None
 781         headerstr = unquotefrompo(header.msgstr)
 782         for line in headerstr.split("\n"):
 783             if not ":" in line: continue
 784             key, value = line.strip().split(":", 1)
 785             if key.strip() != "Content-Type": continue
 786             charsetline = line
 787         if charsetline is None:
 788             headerstr += "Content-Type: text/plain; charset=%s" % self._encoding
 789         else:
 790             charset = re.search("charset=([^ ]*)", charsetline)
 791             if charset is None:
 792                 newcharsetline = charsetline
 793                 if not newcharsetline.strip().endswith(";"):
 794                     newcharsetline += ";"
 795                 newcharsetline += " charset=%s" % self._encoding
 796             else:
 797                 charset = charset.group(1)
 798                 newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1)
 799             headerstr = headerstr.replace(charsetline, newcharsetline, 1)
 800         header.msgstr = quoteforpo(headerstr)
 801
 802     def parse(self, input):
 803         """parses the given file or file source string"""
 804         try:
 805             if hasattr(input, 'name'):
 806                 self.filename = input.name
 807             elif not getattr(self, 'filename', ''):
 808                 self.filename = ''
 809             if hasattr(input, "read"):
 810                 posrc = input.read()
 811                 input.close()
 812                 input = posrc
 813             # TODO: change this to a proper parser that doesn't do line-by-line madness
 814             lines = input.split("\n")
 815             start = 0
 816             end = 0
 817             # make only the first one the header
 818             linesprocessed = 0
 819             is_decoded = False
 820             while end <= len(lines):
 821                 if (end == len(lines)) or (not lines[end].strip()):  # end of lines or blank line
 822                     newpe = self.UnitClass(encoding=self._encoding)
 823                     unit_lines = lines[start:end]
 824                     # We need to work carefully if we haven't decoded properly yet.
 825                     # So let's solve this temporarily until we actually get the
 826                     # encoding from the header.
 827                     if not is_decoded:
 828                         unit_lines = [line.decode('ascii', 'ignore') for line in unit_lines]
 829                     linesprocessed = newpe.parselines(unit_lines)
 830                     start += linesprocessed
 831                     # TODO: find a better way of working out if we actually read anything
 832                     if linesprocessed >= 1 and newpe._getoutput():
 833                         self.units.append(newpe)
 834                         if not is_decoded:
 835                             if newpe.isheader(): # If there is a header...
 836                                 if "Content-Type" in self.parseheader(): # and a Content-Type...
 837                                     if self._encoding.lower() != 'charset': # with a valid charset...
 838                                         self._encoding = newpe._encoding # then change the encoding
 839                                         # otherwise we'll decode using UTF-8
 840                             lines = self.decode(lines)
 841                             self.units = []
 842                             start = 0
 843                             end = 0
 844                             is_decoded = True
 845                 end = end+1
 846         except Exception, e:
 847             raise base.ParseError()
 848
 849     def removeduplicates(self, duplicatestyle="merge"):
 850         """make sure each msgid is unique ; merge comments etc from duplicates into original"""
 851         msgiddict = {}
 852         uniqueunits = []
 853         # we sometimes need to keep track of what has been marked
 854         # TODO: this is using a list as the pos aren't hashable, but this is slow...
 855         markedpos = []
 856         def addcomment(thepo):
 857             thepo.msgidcomments.append('"_: %s\\n"' % " ".join(thepo.getlocations()))
 858             markedpos.append(thepo)
 859         for thepo in self.units:
 860             if duplicatestyle.startswith("msgid_comment"):
 861                 msgid = unquotefrompo(thepo.msgidcomments) + unquotefrompo(thepo.msgid)
 862             else:
 863                 msgid = unquotefrompo(thepo.msgid)
 864             if thepo.isheader():
 865                 # header msgids shouldn't be merged...
 866                 uniqueunits.append(thepo)
 867             elif duplicatestyle == "msgid_comment_all":
 868                 addcomment(thepo)
 869                 uniqueunits.append(thepo)
 870             elif msgid in msgiddict:
 871                 if duplicatestyle == "merge":
 872                     if msgid:
 873                         msgiddict[msgid].merge(thepo)
 874                     else:
 875                         addcomment(thepo)
 876                         uniqueunits.append(thepo)
 877                 elif duplicatestyle == "keep":
 878                     uniqueunits.append(thepo)
 879                 elif duplicatestyle == "msgid_comment":
 880                     origpo = msgiddict[msgid]
 881                     if origpo not in markedpos:
 882                         addcomment(origpo)
 883                     addcomment(thepo)
 884                     uniqueunits.append(thepo)
 885                 elif duplicatestyle == "msgctxt":
 886                     origpo = msgiddict[msgid]
 887                     if origpo not in markedpos:
 888                         origpo.msgctxt.append('"%s"' % " ".join(origpo.getlocations()))
 889                         markedpos.append(thepo)
 890                     thepo.msgctxt.append('"%s"' % " ".join(thepo.getlocations()))
 891                     uniqueunits.append(thepo)
 892             else:
 893                 if not msgid and duplicatestyle != "keep":
 894                     addcomment(thepo)
 895                 msgiddict[msgid] = thepo
 896                 uniqueunits.append(thepo)
 897         self.units = uniqueunits
 898
 899     def __str__(self):
 900         """convert to a string. double check that unicode is handled somehow here"""
 901         output = self._getoutput()
 902         if isinstance(output, unicode):
 903             return output.encode(getattr(self, "encoding", "UTF-8"))
 904         return output
 905
 906     def _getoutput(self):
 907         """convert the units back to lines"""
 908         lines = []
 909         for unit in self.units:
 910             unitsrc = str(unit) + "\n"
 911             lines.append(unitsrc)
 912         lines = "".join(self.encode(lines)).rstrip()
 913         #After the last pounit we will have \n\n and we only want to end in \n:
 914         if lines: lines += "\n"
 915         return lines
 916
 917     def encode(self, lines):
 918         """encode any unicode strings in lines in self._encoding"""
 919         newlines = []
 920         encoding = self._encoding
 921         if encoding is None or encoding.lower() == "charset":
 922             encoding = 'UTF-8'
 923         for line in lines:
 924             if isinstance(line, unicode):
 925                 line = line.encode(encoding)
 926             newlines.append(line)
 927         return newlines
 928
 929     def decode(self, lines):
 930         """decode any non-unicode strings in lines with self._encoding"""
 931         newlines = []
 932         for line in lines:
 933             if isinstance(line, str) and self._encoding is not None and self._encoding.lower() != "charset":
 934                 try:
 935                     line = line.decode(self._encoding)
 936                 except UnicodeError, e:
 937                     raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line))
 938             newlines.append(line)
 939         return newlines
 940
 941     def unit_iter(self):
 942         for unit in self.units:
 943             if not (unit.isheader() or unit.isobsolete()):
 944                 yield unit
 945
 946 if __name__ == '__main__':
 947     import sys
 948     pf = pofile(sys.stdin)
 949     sys.stdout.write(str(pf))
 950