Added some robustness to the statistics code to handle corrupt files.
[translate_toolkit.git] / storage / pypo.py
blob8eba3096c0c8b549a68fe77beea29c8c216f4406
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2002-2007 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """classes that hold units of .po files (pounit) or entire files (pofile)
23 gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)"""
25 from __future__ import generators
26 from translate.misc.multistring import multistring
27 from translate.misc import quote
28 from translate.misc import textwrap
29 from translate.lang import data
30 from translate.storage import pocommon, base
31 import re
# Separator for "#: " entries.
lsep = "\n#: "

# general functions for quoting / unquoting po strings

# Escape sequence as written in a PO file -> the character it stands for.
po_unescape_map = {
    "\\r": "\r",
    "\\t": "\t",
    '\\"': '"',
    '\\n': '\n',
    '\\\\': '\\',
}

# The inverse mapping: raw character -> escape sequence to write out.
po_escape_map = dict(zip(po_unescape_map.values(), po_unescape_map.keys()))
def escapeforpo(line):
    """Escape a line of text for use inside a quoted PO string.

    Every character that has an entry in po_escape_map (carriage return,
    tab, double quote, newline and backslash) is replaced by its escape
    sequence; every other character is copied unchanged.  Callers normally
    split the text on newlines before calling this.

    @param line: unescaped text
    @return: the escaped text
    """
    # All keys of po_escape_map are single characters, so one pass over the
    # characters is enough.  Joining on line[:0] (an empty string of the same
    # type as the input) keeps a unicode input unicode even when it is empty.
    return line[:0].join([po_escape_map.get(char, char) for char in line])
def unescapehandler(escape):
    """Translate one PO escape sequence into the character it represents.

    Sequences without an entry in po_unescape_map are returned unchanged.
    """
    if escape in po_unescape_map:
        return po_unescape_map[escape]
    return escape
def wrapline(line):
    """Wrap text for po files.

    The text is wrapped at 76 columns without altering its whitespace.  A
    space that ends up at the start of a continuation line is moved to the
    end of the line before it.

    @return: the list of wrapped lines
    """
    wrapped = textwrap.wrap(line, 76, replace_whitespace=False,
                            expand_tabs=False, drop_whitespace=False)
    # Lines should not start with a space...
    for position in range(1, len(wrapped)):
        current = wrapped[position]
        if current.startswith(' '):
            # ...so hand the leading space over to the previous line.
            wrapped[position] = current[1:]
            wrapped[position - 1] += ' '
    return wrapped
def quoteforpo(text):
    """Quote and escape text for a PO file.

    @param text: the unescaped text, or None
    @return: a list of quoted, escaped PO lines (empty when text is None)
    """
    if text is None:
        return []
    polines = []
    lines = text.split("\n")
    # Text that contains a newline, or a single line longer than 71
    # characters, is written in the multi-line form.
    if len(lines) > 1 or len(lines[0]) > 71:
        # The blank leader line is left out only when the text is one line
        # that merely ends in a newline.
        if len(lines) != 2 or lines[1]:
            polines.append('""')
        for line in lines[:-1]:
            wrapped = wrapline(line)
            if not wrapped:
                # Nothing to wrap: only the newline is written.
                polines.append('"\\n"')
                continue
            for piece in wrapped[:-1]:
                polines.append('"' + escapeforpo(piece) + '"')
            if wrapped[-1]:
                polines.append('"' + escapeforpo(wrapped[-1]) + '\\n"')
    # Whatever follows the last newline (or the whole text when there is no
    # newline) is written without a trailing escape.
    if lines[-1]:
        for piece in wrapline(lines[-1]):
            polines.append('"' + escapeforpo(piece) + '"')
    return polines
def extractpoline(line):
    """Remove the quotes from a PO line and unescape its contents.

    @param line: a quoted line from a po file (msgid or msgstr)
    @return: the first element of the result of quote.extractwithoutquotes
    """
    return quote.extractwithoutquotes(
        line, '"', '"', '\\', includeescapes=unescapehandler)[0]
def unquotefrompo(postr):
    """Unquote a list of PO lines and join the pieces into one unicode string."""
    pieces = []
    for line in postr:
        pieces.append(extractpoline(line))
    return u"".join(pieces)
def encodingToUse(encoding):
    """Return the encoding to work with, falling back to UTF-8.

    None and the placeholder "CHARSET" both mean that no real encoding has
    been chosen, so 'utf-8' is returned for them.  Any other value is
    returned unchanged; it is NOT checked against the codecs known to the
    Python runtime.

    @param encoding: an encoding name, "CHARSET" or None
    @return: the encoding name to use
    """
    if encoding is None or encoding == "CHARSET":
        return 'utf-8'
    return encoding
126 From the GNU gettext manual:
127 WHITE-SPACE
128 # TRANSLATOR-COMMENTS
129 #. AUTOMATIC-COMMENTS
130 #| PREVIOUS MSGID (Gettext 0.16 - check if this is the correct position - not yet implemented)
131 #: REFERENCE...
132 #, FLAG...
133 msgctxt CONTEXT (Gettext 0.15)
134 msgid UNTRANSLATED-STRING
135 msgstr TRANSLATED-STRING
class pounit(pocommon.pounit):
    """A single entry (unit) of a gettext PO file.

    The raw, still quoted PO lines are kept in these attributes:
      - othercomments:     "# " lines (translator comments)
      - automaticcomments: "#. " lines (comments extracted from the source code)
      - sourcecomments:    "#: " lines, e.g. "#: sourcefile.xxx:35"
      - typecomments:      "#, " lines, e.g. "#, fuzzy"
      - msgidcomments:     KDE style "_: " comments found within the msgid
      - msgctxt, msgid, msgid_plural: lists of quoted lines
      - msgstr: a list of quoted lines, or a dict mapping the plural index
        to such a list
    The obsolete* attributes hold the corresponding parts of an obsolete
    ("#~") unit.
    """

    def __init__(self, source=None, encoding="UTF-8"):
        """Create an empty unit, optionally giving it a source text.

        @param source: unescaped source (msgid) text, or None
        @param encoding: encoding used to decode byte strings; None and
            "CHARSET" fall back to UTF-8 (see encodingToUse)
        """
        self._encoding = encodingToUse(encoding)
        self.obsolete = False
        self._initallcomments(blankall=True)
        self.msgctxt = []
        self.msgid = []
        self.msgid_pluralcomments = []
        self.msgid_plural = []
        self.msgstr = []
        self.obsoletemsgctxt = []
        self.obsoletemsgid = []
        self.obsoletemsgid_pluralcomments = []
        self.obsoletemsgid_plural = []
        self.obsoletemsgstr = []
        if source:
            self.setsource(source)
        super(pounit, self).__init__(source)

    def _initallcomments(self, blankall=False):
        """Initialises allcomments

        @param blankall: when true, every comment list is first reset to an
            empty list; otherwise allcomments is only rebuilt from the
            current lists.
        """
        if blankall:
            self.othercomments = []
            self.automaticcomments = []
            self.sourcecomments = []
            self.typecomments = []
            self.msgidcomments = []
            self.obsoletemsgidcomments = []
        self.allcomments = [self.othercomments,
                            self.automaticcomments,
                            self.sourcecomments,
                            self.typecomments,
                            self.msgidcomments,
                            self.obsoletemsgidcomments]

    def getsource(self):
        """Returns the unescaped msgid (with the plural form, if any) as a multistring"""
        multi = multistring(unquotefrompo(self.msgid), self._encoding)
        if self.hasplural():
            pluralform = unquotefrompo(self.msgid_plural)
            if isinstance(pluralform, str):
                pluralform = pluralform.decode(self._encoding)
            multi.strings.append(pluralform)
        return multi

    def setsource(self, source):
        """Sets the msgid to the given (unescaped) value.

        @param source: an unescaped source string.  A multistring or list
            sets msgid from its first element and, when present,
            msgid_plural from its second.
        """
        if isinstance(source, str):
            source = source.decode(self._encoding)
        if isinstance(source, multistring):
            source = source.strings
        if isinstance(source, list):
            self.msgid = quoteforpo(source[0])
            if len(source) > 1:
                self.msgid_plural = quoteforpo(source[1])
        else:
            self.msgid = quoteforpo(source)
    source = property(getsource, setsource)

    def gettarget(self):
        """Returns the unescaped msgstr as a multistring"""
        if isinstance(self.msgstr, dict):
            # Walk the plural forms by ascending index instead of relying on
            # the iteration order of the dict.
            pluralids = list(self.msgstr.keys())
            pluralids.sort()
            forms = [unquotefrompo(self.msgstr[pluralid]) for pluralid in pluralids]
            multi = multistring(forms, self._encoding)
        else:
            multi = multistring(unquotefrompo(self.msgstr), self._encoding)
        return multi

    def settarget(self, target):
        """Sets the msgstr to the given (unescaped) value

        @param target: a string, multistring, list or dict of strings
        @raise ValueError: if the unit has no plural but target is a list or
            dict with more than one element
        """
        if isinstance(target, str):
            target = target.decode(self._encoding)
        if target == self.target:
            return
        if self.hasplural():
            # multistring is tested first so that its strings list is used.
            if isinstance(target, multistring):
                target = target.strings
            elif isinstance(target, basestring):
                target = [target]
        elif isinstance(target, (dict, list)):
            if len(target) == 1:
                target = target[0]
            else:
                raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target))
        if isinstance(target, list):
            self.msgstr = dict([(i, quoteforpo(target[i])) for i in range(len(target))])
        elif isinstance(target, dict):
            self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.iteritems()])
        else:
            self.msgstr = quoteforpo(target)
    target = property(gettarget, settarget)

    def getnotes(self, origin=None):
        """Return comments based on origin value (programmer, developer, source code and translator)

        @param origin: None returns the translator comments followed by the
            automatic comments
        @raise ValueError: if origin is not one of the known values
        """
        if origin is None:
            comments = u"".join([comment[2:] for comment in self.othercomments])
            comments += u"".join([comment[3:] for comment in self.automaticcomments])
        elif origin == "translator":
            comments = u"".join([comment[2:] for comment in self.othercomments])
        elif origin in ["programmer", "developer", "source code"]:
            comments = u"".join([comment[3:] for comment in self.automaticcomments])
        else:
            raise ValueError("Comment type not valid")
        # Let's drop the last newline
        return comments[:-1]

    def addnote(self, text, origin=None, position="append"):
        """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote

        @param text: the note; nothing is added when it is empty
        @param origin: "programmer", "developer" or "source code" adds "#. "
            automatic comments; anything else adds "# " translator comments
        @param position: "append" adds the note after the existing comments,
            any other value puts it before them
        """
        # We don't want to put in an empty '#' without a real comment:
        if not text:
            return
        text = data.forceunicode(text)
        if origin in ["programmer", "developer", "source code"]:
            commentlist = self.automaticcomments
            linestart = "#. "
        else:
            commentlist = self.othercomments
            linestart = "# "
        newcomments = [linestart + line + "\n" for line in text.split("\n")]
        if position == "append":
            commentlist += newcomments
        else:
            # Insert in place at the front.  This works for both comment
            # kinds (previously a translator note raised UnboundLocalError
            # here) and keeps the list object that allcomments refers to.
            commentlist[:0] = newcomments

    def removenotes(self):
        """Remove all the translator's notes (other comments)"""
        self.othercomments = []

    def copy(self):
        """Return a new unit of the same class holding copies of this unit's lists"""
        newpo = self.__class__()
        newpo._encoding = self._encoding
        newpo.othercomments = self.othercomments[:]
        newpo.automaticcomments = self.automaticcomments[:]
        newpo.sourcecomments = self.sourcecomments[:]
        newpo.typecomments = self.typecomments[:]
        newpo.obsolete = self.obsolete
        newpo.msgidcomments = self.msgidcomments[:]
        newpo.obsoletemsgidcomments = self.obsoletemsgidcomments[:]
        # allcomments has to be rebuilt to refer to the new lists
        newpo._initallcomments()
        newpo.msgctxt = self.msgctxt[:]
        newpo.msgid = self.msgid[:]
        newpo.msgid_pluralcomments = self.msgid_pluralcomments[:]
        newpo.msgid_plural = self.msgid_plural[:]
        if isinstance(self.msgstr, dict):
            newpo.msgstr = self.msgstr.copy()
        else:
            newpo.msgstr = self.msgstr[:]

        newpo.obsoletemsgctxt = self.obsoletemsgctxt[:]
        newpo.obsoletemsgid = self.obsoletemsgid[:]
        newpo.obsoletemsgid_pluralcomments = self.obsoletemsgid_pluralcomments[:]
        newpo.obsoletemsgid_plural = self.obsoletemsgid_plural[:]
        if isinstance(self.obsoletemsgstr, dict):
            newpo.obsoletemsgstr = self.obsoletemsgstr.copy()
        else:
            newpo.obsoletemsgstr = self.obsoletemsgstr[:]
        return newpo

    def msgidlen(self):
        """Return the length of the stripped msgid, plus that of the stripped msgid_plural if present"""
        if self.hasplural():
            return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip())
        else:
            return len(unquotefrompo(self.msgid).strip())

    def msgstrlen(self):
        """Return the length of the stripped msgstr (plural forms joined by newlines)"""
        if isinstance(self.msgstr, dict):
            combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.itervalues()])
            return len(combinedstr.strip())
        else:
            return len(unquotefrompo(self.msgstr).strip())

    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
        """Merges the otherpo (with the same msgid) into this one.

        Overwrite non-blank self.msgstr only if overwrite is True
        merge comments only if comments is True
        When authoritative is True the automatic comments, msgid comments
        and locations of otherpo are not brought across.
        """

        def mergelists(list1, list2, split=False):
            """Add to list1 the entries of list2 that it does not have yet"""
            #decode where necessary
            if unicode in [type(item) for item in list2] + [type(item) for item in list1]:
                for position, item in enumerate(list1):
                    if isinstance(item, str):
                        list1[position] = item.decode("utf-8")
                for position, item in enumerate(list2):
                    if isinstance(item, str):
                        list2[position] = item.decode("utf-8")

            #Determine the newline style of list1
            lineend = ""
            if list1 and list1[0]:
                for candidate in ["\n", "\r", "\n\r"]:
                    if list1[0].endswith(candidate):
                        lineend = candidate
            else:
                lineend = "\n"

            #Split if directed to do so:
            if split:
                splitlist1 = []
                splitlist2 = []
                prefix = "#"
                for item in list1:
                    splitlist1.extend(item.split()[1:])
                    prefix = item.split()[0]
                for item in list2:
                    splitlist2.extend(item.split()[1:])
                    prefix = item.split()[0]
                list1.extend(["%s %s%s" % (prefix, item, lineend) for item in splitlist2 if not item in splitlist1])
            else:
                #Normal merge, but conform to list1 newline style
                if list1 != list2:
                    for item in list2:
                        if lineend:
                            item = item.rstrip() + lineend
                        # avoid duplicate comment lines (this might cause some problems)
                        if item not in list1 or len(item) < 5:
                            list1.append(item)

        if not isinstance(otherpo, pounit):
            super(pounit, self).merge(otherpo, overwrite, comments)
            return
        if comments:
            mergelists(self.othercomments, otherpo.othercomments)
            mergelists(self.typecomments, otherpo.typecomments)
            if not authoritative:
                # We don't bring across otherpo.automaticcomments as we consider ourself
                # to be the authority.  Same applies to otherpo.msgidcomments
                mergelists(self.automaticcomments, otherpo.automaticcomments)
                mergelists(self.msgidcomments, otherpo.msgidcomments)
                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
        if not self.istranslated() or overwrite:
            # Remove kde-style comments from the translation (if any).
            if self._extract_msgidcomments(otherpo.target):
                otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments() + '\n', '')
            self.target = otherpo.target
            if self.source != otherpo.source:
                self.markfuzzy()
            else:
                self.markfuzzy(otherpo.isfuzzy())
        elif not otherpo.istranslated():
            if self.source != otherpo.source:
                self.markfuzzy()
        else:
            if self.target != otherpo.target:
                self.markfuzzy()

    def isheader(self):
        """Return whether this unit looks like the PO header.

        That is: a blank msgid, a non-blank msgstr, and no msgid comments,
        context or locations.
        """
        #return (self.msgidlen() == 0) and (self.msgstrlen() > 0) and (len(self.msgidcomments) == 0)
        #rewritten here for performance:
        return ((self.msgid == [] or self.msgid == ['""']) and
                not (self.msgstr == [] or self.msgstr == ['""'])
                and self.msgidcomments == []
                and (self.msgctxt == [] or self.msgctxt == ['""'])
                and (self.sourcecomments == [] or self.sourcecomments == [""]))

    def isblank(self):
        """Return whether both msgid and msgstr are empty (never true for a header or a unit with msgid comments)"""
        if self.isheader() or len(self.msgidcomments):
            return False
        if (self.msgidlen() == 0) and (self.msgstrlen() == 0):
            return True
        return False
        # TODO: remove:
        # Before, the equivalent of the following was the final return statement:
        # return len(self.source.strip()) == 0

    def hastypecomment(self, typecomment):
        """check whether the given type comment is present"""
        # check for word boundaries properly by using a regular expression...
        pattern = "\\b%s\\b" % typecomment
        for tcline in self.typecomments:
            if re.search(pattern, tcline):
                return True
        return False

    def hasmarkedcomment(self, commentmarker):
        """check whether the given comment marker is present as # (commentmarker) ..."""
        commentmarker = "(%s)" % commentmarker
        for comment in self.othercomments:
            if comment.replace("#", "", 1).strip().startswith(commentmarker):
                return True
        return False

    def settypecomment(self, typecomment, present=True):
        """alters whether a given typecomment is present"""
        if self.hastypecomment(typecomment) != present:
            if present:
                self.typecomments.append("#, %s\n" % typecomment)
            else:
                # this should handle word boundaries properly ...
                pattern = "\\b%s\\b[ \t,]*" % typecomment
                typecomments = [re.sub(pattern, "", tcline) for tcline in self.typecomments]
                # drop the lines that are left with nothing but the "#," marker
                self.typecomments = [tcline for tcline in typecomments if tcline.strip() != "#,"]

    def istranslated(self):
        """Return whether the base class considers the unit translated and it is not obsolete"""
        return super(pounit, self).istranslated() and not self.isobsolete()

    def istranslatable(self):
        """Return whether the unit is neither the header nor blank"""
        return not (self.isheader() or self.isblank())

    def isfuzzy(self):
        """Return whether the unit carries the "fuzzy" type comment"""
        return self.hastypecomment("fuzzy")

    def markfuzzy(self, present=True):
        """Add (or, with present=False, remove) the "fuzzy" type comment"""
        self.settypecomment("fuzzy", present)

    def isreview(self):
        """Return whether the unit is flagged for review by a type comment or a marked comment"""
        return self.hastypecomment("review") or self.hasmarkedcomment("review") or self.hasmarkedcomment("pofilter")

    def isobsolete(self):
        """Return whether the unit is obsolete"""
        return self.obsolete

    def makeobsolete(self):
        """Makes this unit obsolete"""
        self.obsolete = True
        # NOTE(review): unlike the other parts, msgctxt is copied but not
        # cleared here -- confirm whether that is intentional.
        if self.msgctxt:
            self.obsoletemsgctxt = self.msgctxt
        if self.msgid:
            self.obsoletemsgid = self.msgid
            self.msgid = []
        if self.msgidcomments:
            self.obsoletemsgidcomments = self.msgidcomments
            self.msgidcomments = []
        if self.msgid_plural:
            self.obsoletemsgid_plural = self.msgid_plural
            self.msgid_plural = []
        if self.msgstr:
            self.obsoletemsgstr = self.msgstr
            self.msgstr = []
        self.sourcecomments = []
        self.automaticcomments = []

    def resurrect(self):
        """Makes an obsolete unit normal"""
        self.obsolete = False
        if self.obsoletemsgctxt:
            # The context goes back into msgctxt (it used to be written to
            # msgid, where the msgid restore below then overwrote it).
            self.msgctxt = self.obsoletemsgctxt
            self.obsoletemsgctxt = []
        if self.obsoletemsgid:
            self.msgid = self.obsoletemsgid
            self.obsoletemsgid = []
        if self.obsoletemsgidcomments:
            self.msgidcomments = self.obsoletemsgidcomments
            self.obsoletemsgidcomments = []
        if self.obsoletemsgid_plural:
            self.msgid_plural = self.obsoletemsgid_plural
            self.obsoletemsgid_plural = []
        if self.obsoletemsgstr:
            self.msgstr = self.obsoletemsgstr
            self.obsoletemsgstr = []

    def hasplural(self):
        """returns whether this pounit contains plural strings..."""
        return len(self.msgid_plural) > 0

    def parselines(self, lines):
        """Fill this unit from a list of PO lines (without line endings).

        Parsing stops early at a comment or msgid line that follows the
        msgstr (or a msgid following msgid_plural), as that line belongs to
        the next unit.

        @param lines: the lines to parse
        @return: the number of lines consumed
        """
        # NOTE(review): the line that triggers an early stop is included in
        # the returned count although it was not parsed into this unit --
        # confirm against the callers before changing.
        section = None            # msg* keyword the current line belongs to
        inmsgid_comment = False   # inside a KDE style "_:" comment
        msgstr_pluralid = None
        linesprocessed = 0
        for line in lines:
            line = line + "\n"
            linesprocessed += 1
            if line[0] == '#':
                marker = line[1]
                if section == "msgstr" and marker != '~':
                    # if we're already in the message string, this is from the next element
                    break
                if marker == '.':
                    self.automaticcomments.append(line)
                elif marker == ':':
                    self.sourcecomments.append(line)
                elif marker == ',':
                    self.typecomments.append(line)
                elif marker == '~':
                    # obsolete entry: drop the "#~ " prefix and parse the rest
                    line = line[3:]
                    self.obsolete = True
                else:
                    self.othercomments.append(line)
            # 'msgid_plural' has to be tested before 'msgid'
            if line.startswith('msgid_plural'):
                section = "msgid_plural"
                inmsgid_comment = False
            elif line.startswith('msgctxt'):
                section = "msgctxt"
                inmsgid_comment = False
            elif line.startswith('msgid'):
                # if we just finished a msgstr or msgid_plural, there is probably an
                # empty line missing between the units, so let's stop the parsing now.
                if section in ("msgstr", "msgid_plural"):
                    break
                section = "msgid"
                inmsgid_comment = False
            elif line.startswith('msgstr'):
                section = "msgstr"
                if line.startswith('msgstr['):
                    msgstr_pluralid = int(line[len('msgstr['):line.find(']')].strip())
                else:
                    msgstr_pluralid = None
            extracted = quote.extractstr(line)
            if extracted is None:
                continue
            if section == "msgctxt":
                self.msgctxt.append(extracted)
            elif section in ("msgid", "msgid_plural"):
                if section == "msgid":
                    commentlines, contentlines = self.msgidcomments, self.msgid
                else:
                    commentlines, contentlines = self.msgid_pluralcomments, self.msgid_plural
                # TODO: improve kde comment detection
                if extracted.find("_:") != -1:
                    inmsgid_comment = True
                if inmsgid_comment:
                    commentlines.append(extracted)
                else:
                    contentlines.append(extracted)
                # the comment ends on the line containing the escaped newline
                if inmsgid_comment and extracted.find("\\n") != -1:
                    inmsgid_comment = False
            elif section == "msgstr":
                if msgstr_pluralid is None:
                    self.msgstr.append(extracted)
                else:
                    if isinstance(self.msgstr, list):
                        self.msgstr = {0: self.msgstr}
                    if msgstr_pluralid not in self.msgstr:
                        self.msgstr[msgstr_pluralid] = []
                    self.msgstr[msgstr_pluralid].append(extracted)
        if self.obsolete:
            self.makeobsolete()
        # If this unit is the header, we have to get the encoding to ensure that no
        # methods are called that need the encoding before we obtained it.
        if self.isheader():
            charset = re.search("charset=([^\\s]+)", unquotefrompo(self.msgstr))
            if charset:
                self._encoding = encodingToUse(charset.group(1))
        return linesprocessed

    def parse(self, src):
        """Parse a unit from a source string; returns the number of lines consumed"""
        if isinstance(src, str):
            # This has not been decoded yet, so we need to make a plan
            src = src.decode(self._encoding)
        return self.parselines(src.split("\n"))

    def _getmsgpartstr(self, partname, partlines, partcomments=""):
        """Return the PO text for one part (msgctxt, msgid, msgstr, ...).

        @param partname: the keyword to write, e.g. "msgid" or "#~ msgstr"
        @param partlines: the quoted lines of the part, or a dict mapping a
            plural index to such lines (written as partname[index])
        @param partcomments: quoted KDE style comment lines to write before
            the content
        """
        if isinstance(partlines, dict):
            partkeys = list(partlines.keys())
            partkeys.sort()
            return "".join([self._getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys])
        partstr = partname + " "
        partstartline = 0
        if len(partlines) > 0 and len(partcomments) == 0:
            partstr += partlines[0]
            partstartline = 1
        elif len(partcomments) > 0:
            if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0:
                # if there is a blank leader line, it must come before the comment
                partstr += partlines[0] + '\n'
                # but if the whole string is blank, leave it in
                if len(partlines) > 1:
                    partstartline += 1
            else:
                # All partcomments should start on a newline
                partstr += '""\n'
            # combine comments into one if more than one
            if len(partcomments) > 1:
                combinedcomment = []
                for comment in partcomments:
                    comment = unquotefrompo([comment])
                    if comment.startswith("_:"):
                        comment = comment[len("_:"):]
                    if comment.endswith("\\n"):
                        comment = comment[:-len("\\n")]
                    #Before we used to strip. Necessary in some cases?
                    combinedcomment.append(comment)
                partcomments = quoteforpo("_:%s" % "".join(combinedcomment))
            # comments first, no blank leader line needed
            partstr += "\n".join(partcomments)
            partstr = quote.rstripeol(partstr)
        else:
            partstr += '""'
        partstr += '\n'
        # add the rest
        for partline in partlines[partstartline:]:
            partstr += partline + '\n'
        return partstr

    def _encodeifneccessary(self, output):
        """encodes unicode strings and returns other strings unchanged"""
        if isinstance(output, unicode):
            # NOTE(review): this reads the attribute "encoding", not the
            # "_encoding" set in __init__, so it falls back to UTF-8 unless
            # something else defines "encoding" -- confirm this is intended.
            encoding = encodingToUse(getattr(self, "encoding", "UTF-8"))
            return output.encode(encoding)
        return output

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        output = self._getoutput()
        return self._encodeifneccessary(output)

    def _getoutput(self):
        """return this po element as a string"""
        lines = []
        lines.extend(self.othercomments)
        if self.isobsolete():
            lines.extend(self.typecomments)
            obsoletelines = []
            if self.obsoletemsgctxt:
                obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
            obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
            if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
                obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
            obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
            for index, obsoleteline in enumerate(obsoletelines):
                # We need to account for a multiline msgid or msgstr here
                obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "')
            lines.extend(obsoletelines)
            lines = [self._encodeifneccessary(line) for line in lines]
            return "".join(lines)
        # if there's no msgid don't do msgid and string, unless we're the header
        # this will also discard any comments other than plain othercomments...
        if (len(self.msgid) == 0) or ((len(self.msgid) == 1) and (self.msgid[0] == '""')):
            if not (self.isheader() or self.msgidcomments or self.sourcecomments):
                return "".join(lines)
        lines.extend(self.automaticcomments)
        lines.extend(self.sourcecomments)
        lines.extend(self.typecomments)
        if self.msgctxt:
            lines.append(self._getmsgpartstr("msgctxt", self.msgctxt))
        lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments))
        if self.msgid_plural or self.msgid_pluralcomments:
            lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
        lines.append(self._getmsgpartstr("msgstr", self.msgstr))
        lines = [self._encodeifneccessary(line) for line in lines]
        postr = "".join(lines)
        return postr

    def getlocations(self):
        """Get a list of locations from sourcecomments in the PO unit

        rtype: List
        return: A list of the locations with '#: ' stripped
        """
        locations = []
        for sourcecomment in self.sourcecomments:
            locations += quote.rstripeol(sourcecomment)[3:].split()
        return locations

    def addlocation(self, location):
        """Add a location to sourcecomments in the PO unit

        @param location: Text location e.g. 'file.c:23' does not include #:
        @type location: String
        """
        self.sourcecomments.append("#: %s\n" % location)

    def _extract_msgidcomments(self, text=None):
        """Extract KDE style msgid comments from the unit.

        When text is empty or None, the unit's own msgidcomments are used.

        @rtype: String
        @return: the first line of the text with the first '_: ' removed
        """
        if not text:
            text = unquotefrompo(self.msgidcomments)
        return text.split('\n')[0].replace('_: ', '', 1)

    def getcontext(self):
        """Get the message context."""
        return unquotefrompo(self.msgctxt) + self._extract_msgidcomments()

    def getid(self):
        """Returns a unique identifier for this unit."""
        context = self.getcontext()
        # Gettext does not consider the plural to determine duplicates, only
        # the msgid.  For generation of .mo files, we might want to use this
        # code to generate the entry for the hash table, but for now, it is
        # commented out for conformance to gettext.
        # id = '\0'.join(self.source.strings)
        unitid = self.source
        if self.msgidcomments:
            unitid = "_: %s\n%s" % (context, unitid)
        elif context:
            unitid = "%s\04%s" % (context, unitid)
        return unitid
class pofile(pocommon.pofile):
    """this represents a .po file containing various units"""
    UnitClass = pounit

    def __init__(self, inputfile=None, encoding=None, unitclass=pounit):
        """construct a pofile, optionally reading in from inputfile.
        encoding can be specified but otherwise will be read from the PO header"""
        self.UnitClass = unitclass
        pocommon.pofile.__init__(self, unitclass=unitclass)
        self.units = []
        self.filename = ''
        self._encoding = encodingToUse(encoding)
        if inputfile is not None:
            self.parse(inputfile)

    def changeencoding(self, newencoding):
        """changes the encoding on the file

        The charset in the Content-Type line of the header is updated as
        well; the line is added if the header has none.  Nothing but the
        stored encoding changes when there are no units or the header is
        missing or blank.
        """
        self._encoding = encodingToUse(newencoding)
        if not self.units:
            return
        header = self.header()
        if not header or header.isblank():
            return
        charsetline = None
        headerstr = unquotefrompo(header.msgstr)
        for line in headerstr.split("\n"):
            if ":" not in line:
                continue
            key, value = line.strip().split(":", 1)
            if key.strip() != "Content-Type":
                continue
            charsetline = line
        if charsetline is None:
            headerstr += "Content-Type: text/plain; charset=%s" % self._encoding
        else:
            charset = re.search("charset=([^ ]*)", charsetline)
            if charset is None:
                # Content-Type without a charset: append one
                newcharsetline = charsetline
                if not newcharsetline.strip().endswith(";"):
                    newcharsetline += ";"
                newcharsetline += " charset=%s" % self._encoding
            else:
                charset = charset.group(1)
                newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1)
            headerstr = headerstr.replace(charsetline, newcharsetline, 1)
        header.msgstr = quoteforpo(headerstr)

    def parse(self, input):
        """parses the given file or file source string

        @param input: a string, or an object with a read() method, which is
            read completely and then closed
        @raise base.ParseError: if anything goes wrong while parsing
        """
        try:
            if hasattr(input, 'name'):
                self.filename = input.name
            elif not getattr(self, 'filename', ''):
                self.filename = ''
            if hasattr(input, "read"):
                # close the file even if reading it fails
                try:
                    posrc = input.read()
                finally:
                    input.close()
                input = posrc
            # TODO: change this to a proper parser that doesn't do line-by-line madness
            lines = input.split("\n")
            start = 0
            end = 0
            is_decoded = False
            while end <= len(lines):
                if (end == len(lines)) or (not lines[end].strip()):   # end of lines or blank line
                    newpe = self.UnitClass(encoding=self._encoding)
                    unit_lines = lines[start:end]
                    # We need to work carefully if we haven't decoded properly yet.
                    # So let's solve this temporarily until we actually get the
                    # encoding from the header.
                    if not is_decoded:
                        unit_lines = [line.decode('ascii', 'ignore') for line in unit_lines]
                    linesprocessed = newpe.parselines(unit_lines)
                    start += linesprocessed
                    # TODO: find a better way of working out if we actually read anything
                    if linesprocessed >= 1 and newpe._getoutput():
                        self.units.append(newpe)
                        if not is_decoded:
                            if newpe.isheader(): # If there is a header...
                                if "Content-Type" in self.parseheader(): # and a Content-Type...
                                    if self._encoding.lower() != 'charset': # with a valid charset...
                                        self._encoding = newpe._encoding # then change the encoding
                            # otherwise we'll decode using UTF-8
                            # Now that the encoding is settled, decode all
                            # the lines and parse again from the top.
                            lines = self.decode(lines)
                            self.units = []
                            start = 0
                            end = 0
                            is_decoded = True
                end = end + 1
        except Exception:
            raise base.ParseError()

    def removeduplicates(self, duplicatestyle="merge"):
        """make sure each msgid is unique ; merge comments etc from duplicates into original

        @param duplicatestyle: one of "merge", "keep", "msgid_comment",
            "msgid_comment_all" or "msgctxt"
        """
        msgiddict = {}
        uniqueunits = []
        # we sometimes need to keep track of what has been marked
        # TODO: this is using a list as the pos aren't hashable, but this is slow...
        markedpos = []
        def addcomment(thepo):
            thepo.msgidcomments.append('"_: %s\\n"' % " ".join(thepo.getlocations()))
            markedpos.append(thepo)
        for thepo in self.units:
            if duplicatestyle.startswith("msgid_comment"):
                msgid = unquotefrompo(thepo.msgidcomments) + unquotefrompo(thepo.msgid)
            else:
                msgid = unquotefrompo(thepo.msgid)
            if thepo.isheader():
                # header msgids shouldn't be merged...
                uniqueunits.append(thepo)
            elif duplicatestyle == "msgid_comment_all":
                addcomment(thepo)
                uniqueunits.append(thepo)
            elif msgid in msgiddict:
                if duplicatestyle == "merge":
                    if msgid:
                        msgiddict[msgid].merge(thepo)
                    else:
                        addcomment(thepo)
                        uniqueunits.append(thepo)
                elif duplicatestyle == "keep":
                    uniqueunits.append(thepo)
                elif duplicatestyle == "msgid_comment":
                    origpo = msgiddict[msgid]
                    if origpo not in markedpos:
                        addcomment(origpo)
                    addcomment(thepo)
                    uniqueunits.append(thepo)
                elif duplicatestyle == "msgctxt":
                    origpo = msgiddict[msgid]
                    if origpo not in markedpos:
                        origpo.msgctxt.append('"%s"' % " ".join(origpo.getlocations()))
                        # Remember the original (not the duplicate), so that
                        # it is not given its context again by a later
                        # duplicate of the same msgid.
                        markedpos.append(origpo)
                    thepo.msgctxt.append('"%s"' % " ".join(thepo.getlocations()))
                    uniqueunits.append(thepo)
            else:
                if not msgid and duplicatestyle != "keep":
                    addcomment(thepo)
                msgiddict[msgid] = thepo
                uniqueunits.append(thepo)
        self.units = uniqueunits

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        output = self._getoutput()
        if isinstance(output, unicode):
            # NOTE(review): reads "encoding", not the "_encoding" set in
            # __init__ -- confirm this is intended.
            return output.encode(getattr(self, "encoding", "UTF-8"))
        return output

    def _getoutput(self):
        """convert the units back to lines"""
        lines = []
        for unit in self.units:
            unitsrc = str(unit) + "\n"
            lines.append(unitsrc)
        lines = "".join(self.encode(lines)).rstrip()
        #After the last pounit we will have \n\n and we only want to end in \n:
        if lines:
            lines += "\n"
        return lines

    def encode(self, lines):
        """encode any unicode strings in lines in self._encoding"""
        newlines = []
        encoding = self._encoding
        if encoding is None or encoding.lower() == "charset":
            encoding = 'UTF-8'
        for line in lines:
            if isinstance(line, unicode):
                line = line.encode(encoding)
            newlines.append(line)
        return newlines

    def decode(self, lines):
        """decode any non-unicode strings in lines with self._encoding

        @raise UnicodeError: if a line cannot be decoded; the message names
            the encoding and the offending line
        """
        import sys
        newlines = []
        for line in lines:
            if isinstance(line, str) and self._encoding is not None and self._encoding.lower() != "charset":
                try:
                    line = line.decode(self._encoding)
                except UnicodeError:
                    # sys.exc_info() is used instead of binding the exception
                    # in the except clause, as that syntax differs between
                    # Python versions.
                    e = sys.exc_info()[1]
                    raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line))
            newlines.append(line)
        return newlines

    def unit_iter(self):
        """Iterate over the units that are neither the header nor obsolete"""
        for unit in self.units:
            if not (unit.isheader() or unit.isobsolete()):
                yield unit
if __name__ == '__main__':
    # Used as a filter: parse a PO file from standard input and write its
    # serialised form to standard output.
    import sys
    sys.stdout.write(str(pofile(sys.stdin)))