2 # -*- coding: utf-8 -*-
4 # Copyright 2002-2007 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """classes that hold units of .po files (pounit) or entire files (pofile)
23 gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)"""
25 from __future__
import generators
26 from translate
.misc
.multistring
import multistring
27 from translate
.misc
import quote
28 from translate
.misc
import textwrap
29 from translate
.lang
import data
30 from translate
.storage
import pocommon
, base
34 """Seperator for #: entries"""
36 # general functions for quoting / unquoting po strings
po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'}
# Inverse of po_unescape_map: raw character -> its two-character escape sequence.
po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()])

def escapeforpo(line):
    """Escapes a line for po format.

    Each character that is a key of po_escape_map (carriage return, tab,
    double quote, newline and backslash) is replaced by its backslash escape
    sequence; every other character is copied unchanged.

    @param line: unescaped text
    @return: the escaped text, of the same string type as line
    """
    # Every key of po_escape_map is a single character, so one dictionary
    # lookup per character is sufficient.  Joining on line[:0] (the empty
    # string of line's own type) keeps the result type equal to the input type.
    return line[:0].join([po_escape_map.get(char, char) for char in line])

def unescapehandler(escape):
    """Returns the character for a po escape sequence.

    Sequences that are not keys of po_unescape_map are returned unchanged.

    @param escape: an escape sequence, e.g. a backslash followed by "n"
    """
    return po_unescape_map.get(escape, escape)
65 """Wrap text for po files."""
66 wrappedlines
= textwrap
.wrap(line
, 76, replace_whitespace
=False, expand_tabs
=False, drop_whitespace
=False)
68 # Lines should not start with a space...
69 if len(wrappedlines
) > 1:
70 for index
, line
in enumerate(wrappedlines
[1:]):
71 if line
.startswith(' '):
72 # Remove the space at the beginning of the line:
73 wrappedlines
[index
+1] = line
[1:]
75 # Append a space to the previous line:
76 wrappedlines
[index
] += ' '
80 """quotes the given text for a PO file, returning quoted and escaped lines"""
84 lines
= text
.split("\n")
85 if len(lines
) > 1 or (len(lines
) == 1 and len(lines
[0]) > 71):
86 if len(lines
) != 2 or lines
[1]:
87 polines
.extend(['""'])
88 for line
in lines
[:-1]:
92 polines
.extend(['"' + escapeforpo(ln
) + '"'])
94 polines
.extend(['"' + escapeforpo(lns
[-1]) + '\\n"'])
96 polines
.extend(['"\\n"'])
98 polines
.extend(['"' + escapeforpo(line
) + '"' for line
in wrapline(lines
[-1])])
def extractpoline(line):
    """Remove quote and unescape line from po file.

    @param line: a quoted line from a po file (msgid or msgstr)
    @return: the first element of the result of quote.extractwithoutquotes,
        with escape sequences translated by unescapehandler
    """
    extracted = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)[0]
    return extracted
def unquotefrompo(postr):
    """Joins the unquoted, unescaped contents of a sequence of quoted po lines.

    @param postr: iterable of quoted lines (as stored in msgid / msgstr lists)
    @return: one unicode string made of every extracted line, in order
    """
    pieces = []
    for quotedline in postr:
        pieces.append(extractpoline(quotedline))
    return u"".join(pieces)
def encodingToUse(encoding):
    """Returns the encoding to use, substituting utf-8 when none is given.

    None and the placeholder value "CHARSET" both yield 'utf-8'; any other
    value is returned unchanged.  No check is made that the encoding is
    actually known to the python runtime.

    @param encoding: an encoding name, "CHARSET" or None
    """
    if encoding is None or encoding == "CHARSET":
        return 'utf-8'
    return encoding
126 From the GNU gettext manual:
128 # TRANSLATOR-COMMENTS
129 #. AUTOMATIC-COMMENTS
130 #| PREVIOUS MSGID (Gettext 0.16 - check if this is the correct position - not yet implemented)
133 msgctxt CONTEXT (Gettext 0.15)
134 msgid UNTRANSLATED-STRING
135 msgstr TRANSLATED-STRING
138 class pounit(pocommon
.pounit
):
139 # othercomments = [] # # this is another comment
140 # automaticcomments = [] # #. comment extracted from the source code
141 # sourcecomments = [] # #: sourcefile.xxx:35
142 # typecomments = [] # #, fuzzy
143 # msgidcomments = [] # _: within msgid
def __init__(self, source=None, encoding="UTF-8"):
    """Creates an empty unit, optionally with the given source text.

    @param source: unescaped source (msgid) text, or None
    @param encoding: encoding name; passed through encodingToUse
    """
    self._encoding = encodingToUse(encoding)
    self.obsolete = False
    self._initallcomments(blankall=True)
    # Every message part is a list of quoted po lines; other methods of this
    # class read and append to these, so each one must exist from the start.
    self.msgctxt = []
    self.msgid = []
    self.msgid_pluralcomments = []
    self.msgid_plural = []
    self.msgstr = []
    self.obsoletemsgctxt = []
    self.obsoletemsgid = []
    self.obsoletemsgid_pluralcomments = []
    self.obsoletemsgid_plural = []
    self.obsoletemsgstr = []
    if source:
        # msgid is already empty, so there is nothing to set for None or ""
        self.setsource(source)
    super(pounit, self).__init__(source)
def _initallcomments(self, blankall=False):
    """Initialises allcomments

    @param blankall: when True, every comment list is first reset to an empty
        list; when False the existing lists are kept and only the allcomments
        index is rebuilt (copy() relies on this after assigning the lists).
    """
    if blankall:
        self.othercomments = []
        self.automaticcomments = []
        self.sourcecomments = []
        self.typecomments = []
        self.msgidcomments = []
        self.obsoletemsgidcomments = []
    # allcomments holds references to the lists themselves, not copies
    self.allcomments = [self.othercomments,
                        self.automaticcomments,
                        self.sourcecomments,
                        self.typecomments,
                        self.msgidcomments,
                        self.obsoletemsgidcomments]
183 """Returns the unescaped msgid"""
184 multi
= multistring(unquotefrompo(self
.msgid
), self
._encoding
)
186 pluralform
= unquotefrompo(self
.msgid_plural
)
187 if isinstance(pluralform
, str):
188 pluralform
= pluralform
.decode(self
._encoding
)
189 multi
.strings
.append(pluralform
)
def setsource(self, source):
    """Sets the msgid to the given (unescaped) value.

    @param source: an unescaped source string, a multistring, or a list whose
        first item is the singular and whose second item (if any) is the plural.
    """
    if isinstance(source, str):
        source = source.decode(self._encoding)
    if isinstance(source, multistring):
        source = source.strings
    if isinstance(source, list):
        self.msgid = quoteforpo(source[0])
        # only touch msgid_plural when a plural form was actually supplied
        if len(source) > 1:
            self.msgid_plural = quoteforpo(source[1])
    else:
        self.msgid = quoteforpo(source)
207 source
= property(getsource
, setsource
)
210 """Returns the unescaped msgstr"""
211 if isinstance(self
.msgstr
, dict):
212 multi
= multistring(map(unquotefrompo
, self
.msgstr
.values()), self
._encoding
)
214 multi
= multistring(unquotefrompo(self
.msgstr
), self
._encoding
)
217 def settarget(self
, target
):
218 """Sets the msgstr to the given (unescaped) value"""
219 if isinstance(target
, str):
220 target
= target
.decode(self
._encoding
)
221 if target
== self
.target
:
224 if isinstance(target
, multistring
):
225 target
= target
.strings
226 elif isinstance(target
, basestring
):
228 elif isinstance(target
,(dict, list)):
232 raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target
), target
))
233 templates
= self
.msgstr
234 if isinstance(templates
, list):
235 templates
= {0: templates
}
236 if isinstance(target
, list):
237 self
.msgstr
= dict([(i
, quoteforpo(target
[i
])) for i
in range(len(target
))])
238 elif isinstance(target
, dict):
239 self
.msgstr
= dict([(i
, quoteforpo(targetstring
)) for i
, targetstring
in target
.iteritems()])
241 self
.msgstr
= quoteforpo(target
)
242 target
= property(gettarget
, settarget
)
def getnotes(self, origin=None):
    """Return comments based on origin value (programmer, developer, source code and translator)

    @param origin: None for translator comments followed by automatic
        comments, "translator" for othercomments only, or one of
        "programmer", "developer", "source code" for automaticcomments only
    @return: the comment text with the comment markers and the final
        newline removed
    @raise ValueError: for any other origin
    """
    if origin is None:
        comments = u"".join([comment[2:] for comment in self.othercomments])
        comments += u"".join([comment[3:] for comment in self.automaticcomments])
    elif origin == "translator":
        comments = u"".join([comment[2:] for comment in self.othercomments])
    elif origin in ["programmer", "developer", "source code"]:
        comments = u"".join([comment[3:] for comment in self.automaticcomments])
    else:
        raise ValueError("Comment type not valid")
    # Let's drop the last newline
    return comments[:-1]
def addnote(self, text, origin=None, position="append"):
    """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote

    @param text: the note; one comment line is added per line of text
    @param origin: "programmer", "developer" or "source code" adds automatic
        ("#. ") comments; anything else adds translator ("# ") comments
    @param position: "append" adds after the existing comments, any other
        value adds before them
    """
    # We don't want to put in an empty '#' without a real comment:
    if not text:
        return
    text = data.forceunicode(text)
    if origin in ["programmer", "developer", "source code"]:
        commentlist = self.automaticcomments
        linestart = "#. "
    else:
        commentlist = self.othercomments
        linestart = "# "
    newcomments = [linestart + line + "\n" for line in text.split("\n")]
    # Both cases modify the existing list in place, so the references kept in
    # self.allcomments keep pointing at the current comments.
    if position == "append":
        commentlist += newcomments
    else:
        commentlist[:0] = newcomments
def removenotes(self):
    """Remove all the translator's notes (other comments)"""
    # Rebinds othercomments to a fresh empty list; no other comment list is touched.
    self.othercomments = []
286 newpo
= self
.__class
__()
287 newpo
.othercomments
= self
.othercomments
[:]
288 newpo
.automaticcomments
= self
.automaticcomments
[:]
289 newpo
.sourcecomments
= self
.sourcecomments
[:]
290 newpo
.typecomments
= self
.typecomments
[:]
291 newpo
.obsolete
= self
.obsolete
292 newpo
.msgidcomments
= self
.msgidcomments
[:]
293 newpo
._initallcomments
()
294 newpo
.msgctxt
= self
.msgctxt
[:]
295 newpo
.msgid
= self
.msgid
[:]
296 newpo
.msgid_pluralcomments
= self
.msgid_pluralcomments
[:]
297 newpo
.msgid_plural
= self
.msgid_plural
[:]
298 if isinstance(self
.msgstr
, dict):
299 newpo
.msgstr
= self
.msgstr
.copy()
301 newpo
.msgstr
= self
.msgstr
[:]
303 newpo
.obsoletemsgctxt
= self
.obsoletemsgctxt
[:]
304 newpo
.obsoletemsgid
= self
.obsoletemsgid
[:]
305 newpo
.obsoletemsgid_pluralcomments
= self
.obsoletemsgid_pluralcomments
[:]
306 newpo
.obsoletemsgid_plural
= self
.obsoletemsgid_plural
[:]
307 if isinstance(self
.obsoletemsgstr
, dict):
308 newpo
.obsoletemsgstr
= self
.obsoletemsgstr
.copy()
310 newpo
.obsoletemsgstr
= self
.obsoletemsgstr
[:]
def msgidlen(self):
    """Returns the length of the stripped, unescaped msgid (plus msgid_plural for plural units)"""
    if self.hasplural():
        return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip())
    else:
        return len(unquotefrompo(self.msgid).strip())
def msgstrlen(self):
    """Returns the length of the stripped, unescaped msgstr.

    For a plural unit (msgstr stored as a dict) the stripped plural forms are
    joined with newlines and the length of the stripped result is returned.
    """
    if isinstance(self.msgstr, dict):
        combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.values()])
        return len(combinedstr.strip())
    else:
        return len(unquotefrompo(self.msgstr).strip())
326 def merge(self
, otherpo
, overwrite
=False, comments
=True, authoritative
=False):
327 """Merges the otherpo (with the same msgid) into this one.
329 Overwrite non-blank self.msgstr only if overwrite is True
330 merge comments only if comments is True
334 def mergelists(list1
, list2
, split
=False):
335 #decode where necessary
336 if unicode in [type(item
) for item
in list2
] + [type(item
) for item
in list1
]:
337 for position
, item
in enumerate(list1
):
338 if isinstance(item
, str):
339 list1
[position
] = item
.decode("utf-8")
340 for position
, item
in enumerate(list2
):
341 if isinstance(item
, str):
342 list2
[position
] = item
.decode("utf-8")
344 #Determine the newline style of list1
346 if list1
and list1
[0]:
347 for candidate
in ["\n", "\r", "\n\r"]:
348 if list1
[0].endswith(candidate
):
355 #Split if directed to do so:
361 splitlist1
.extend(item
.split()[1:])
362 prefix
= item
.split()[0]
364 splitlist2
.extend(item
.split()[1:])
365 prefix
= item
.split()[0]
366 list1
.extend(["%s %s%s" % (prefix
, item
, lineend
) for item
in splitlist2
if not item
in splitlist1
])
368 #Normal merge, but conform to list1 newline style
372 item
= item
.rstrip() + lineend
373 # avoid duplicate comment lines (this might cause some problems)
374 if item
not in list1
or len(item
) < 5:
376 if not isinstance(otherpo
, pounit
):
377 super(pounit
, self
).merge(otherpo
, overwrite
, comments
)
380 mergelists(self
.othercomments
, otherpo
.othercomments
)
381 mergelists(self
.typecomments
, otherpo
.typecomments
)
382 if not authoritative
:
# We don't bring across otherpo.automaticcomments as we consider ourselves
# to be the authority. The same applies to otherpo.msgidcomments
385 mergelists(self
.automaticcomments
, otherpo
.automaticcomments
)
386 mergelists(self
.msgidcomments
, otherpo
.msgidcomments
)
387 mergelists(self
.sourcecomments
, otherpo
.sourcecomments
, split
=True)
388 if not self
.istranslated() or overwrite
:
389 # Remove kde-style comments from the translation (if any).
390 if self
._extract
_msgidcomments
(otherpo
.target
):
391 otherpo
.target
= otherpo
.target
.replace('_: ' + otherpo
._extract
_msgidcomments
()+ '\n', '')
392 self
.target
= otherpo
.target
393 if self
.source
!= otherpo
.source
:
396 self
.markfuzzy(otherpo
.isfuzzy())
397 elif not otherpo
.istranslated():
398 if self
.source
!= otherpo
.source
:
401 if self
.target
!= otherpo
.target
:
def isheader(self):
    """Returns whether this unit is the po header.

    A header has an empty msgid, a non-empty msgstr, no msgid comments, an
    empty msgctxt and no source comments.
    """
    #return (self.msgidlen() == 0) and (self.msgstrlen() > 0) and (len(self.msgidcomments) == 0)
    #rewritten here for performance:
    return ((self.msgid == [] or self.msgid == ['""']) and
            not (self.msgstr == [] or self.msgstr == ['""'])
            and self.msgidcomments == []
            and (self.msgctxt == [] or self.msgctxt == ['""'])
            and (self.sourcecomments == [] or self.sourcecomments == [""]))
def isblank(self):
    """Returns whether the unit has neither source nor target text.

    The header and units carrying msgid comments are never blank.
    """
    if self.isheader() or len(self.msgidcomments):
        return False
    if (self.msgidlen() == 0) and (self.msgstrlen() == 0):
        return True
    return False
    # Before, the equivalent of the following was the final return statement:
    # return len(self.source.strip()) == 0
def hastypecomment(self, typecomment):
    """check whether the given type comment is present

    @param typecomment: the flag to look for, e.g. "fuzzy"
    @return: True when any line of typecomments contains it as a whole word
    """
    # check for word boundaries properly by using a regular expression...
    # The flag is escaped so that it is always matched literally.
    pattern = re.compile("\\b%s\\b" % re.escape(typecomment))
    for tcline in self.typecomments:
        if pattern.search(tcline):
            return True
    return False
def hasmarkedcomment(self, commentmarker):
    """check whether the given comment marker is present as # (commentmarker) ...

    @return: True when some line of othercomments, with its first "#" and
        surrounding whitespace removed, starts with "(commentmarker)"
    """
    commentmarker = "(%s)" % commentmarker
    for comment in self.othercomments:
        if comment.replace("#", "", 1).strip().startswith(commentmarker):
            return True
    return False
def settypecomment(self, typecomment, present=True):
    """alters whether a given typecomment is present

    @param typecomment: the flag, e.g. "fuzzy"
    @param present: True to make sure the flag is set, False to remove it
    """
    if self.hastypecomment(typecomment) != present:
        if present:
            self.typecomments.append("#, %s\n" % typecomment)
        else:
            # this should handle word boundaries properly ...
            # (the flag is escaped so that it is always matched literally)
            pattern = re.compile("\\b%s\\b[ \t,]*" % re.escape(typecomment))
            stripped = [pattern.sub("", tcline) for tcline in self.typecomments]
            # Drop lines that are left with no flag at all.  The list is
            # updated in place so that it stays a real list and any other
            # reference to it sees the change.
            self.typecomments[:] = [tcline for tcline in stripped if tcline.strip() != "#,"]
def istranslated(self):
    """Returns whether the base class considers the unit translated and it is not obsolete."""
    return super(pounit, self).istranslated() and not self.isobsolete()
def istranslatable(self):
    """Returns whether the unit is neither the header nor blank."""
    return not (self.isheader() or self.isblank())
def isfuzzy(self):
    """Returns whether the unit carries the "fuzzy" type comment."""
    return self.hastypecomment("fuzzy")
def markfuzzy(self, present=True):
    """Sets the "fuzzy" type comment, or removes it when present is False."""
    self.settypecomment("fuzzy", present)
459 return self
.hastypecomment("review") or self
.hasmarkedcomment("review") or self
.hasmarkedcomment("pofilter")
def isobsolete(self):
    """Returns whether this unit is marked obsolete."""
    return self.obsolete
def makeobsolete(self):
    """Makes this unit obsolete

    The message parts are moved to their obsolete* counterparts and the
    source and automatic comments are dropped.
    """
    self.obsolete = True
    if self.msgctxt:
        self.obsoletemsgctxt = self.msgctxt
    if self.msgid:
        self.obsoletemsgid = self.msgid
        self.msgid = []
    if self.msgidcomments:
        self.obsoletemsgidcomments = self.msgidcomments
        self.msgidcomments = []
    if self.msgid_plural:
        self.obsoletemsgid_plural = self.msgid_plural
        self.msgid_plural = []
    if self.msgstr:
        self.obsoletemsgstr = self.msgstr
        self.msgstr = []
    self.sourcecomments = []
    self.automaticcomments = []
485 """Makes an obsolete unit normal"""
486 self
.obsolete
= False
487 if self
.obsoletemsgctxt
:
488 self
.msgid
= self
.obsoletemsgctxt
489 self
.obsoletemsgctxt
= []
490 if self
.obsoletemsgid
:
491 self
.msgid
= self
.obsoletemsgid
492 self
.obsoletemsgid
= []
493 if self
.obsoletemsgidcomments
:
494 self
.msgidcomments
= self
.obsoletemsgidcomments
495 self
.obsoletemsgidcomments
= []
496 if self
.obsoletemsgid_plural
:
497 self
.msgid_plural
= self
.obsoletemsgid_plural
498 self
.obsoletemsgid_plural
= []
499 if self
.obsoletemsgstr
:
500 self
.msgstr
= self
.obsoletemsgstr
501 self
.obsoletemgstr
= []
504 """returns whether this pounit contains plural strings..."""
505 return len(self
.msgid_plural
) > 0
507 def parselines(self
, lines
):
513 msgstr_pluralid
= None
521 if inmsgstr
and not line
[1] == '~':
522 # if we're already in the message string, this is from the next element
525 self
.automaticcomments
.append(line
)
527 self
.sourcecomments
.append(line
)
529 self
.typecomments
.append(line
)
534 self
.othercomments
.append(line
)
535 if line
.startswith('msgid_plural'):
541 elif line
.startswith('msgctxt'):
547 elif line
.startswith('msgid'):
548 # if we just finished a msgstr or msgid_plural, there is probably an
549 # empty line missing between the units, so let's stop the parsing now.
550 if inmsgstr
or inmsgid_plural
:
557 elif line
.startswith('msgstr'):
562 if line
.startswith('msgstr['):
563 msgstr_pluralid
= int(line
[len('msgstr['):line
.find(']')].strip())
565 msgstr_pluralid
= None
566 extracted
= quote
.extractstr(line
)
567 if not extracted
is None:
569 self
.msgctxt
.append(extracted
)
571 # TODO: improve kde comment detection
572 if extracted
.find("_:") != -1:
575 self
.msgidcomments
.append(extracted
)
577 self
.msgid
.append(extracted
)
578 if inmsgid_comment
and extracted
.find("\\n") != -1:
581 if extracted
.find("_:") != -1:
584 self
.msgid_pluralcomments
.append(extracted
)
586 self
.msgid_plural
.append(extracted
)
587 if inmsgid_comment
and extracted
.find("\\n") != -1:
590 if msgstr_pluralid
is None:
591 self
.msgstr
.append(extracted
)
593 if type(self
.msgstr
) == list:
594 self
.msgstr
= {0: self
.msgstr
}
595 if msgstr_pluralid
not in self
.msgstr
:
596 self
.msgstr
[msgstr_pluralid
] = []
597 self
.msgstr
[msgstr_pluralid
].append(extracted
)
600 # If this unit is the header, we have to get the encoding to ensure that no
601 # methods are called that need the encoding before we obtained it.
603 charset
= re
.search("charset=([^\\s]+)", unquotefrompo(self
.msgstr
))
605 self
._encoding
= encodingToUse(charset
.group(1))
606 return linesprocessed
def parse(self, src):
    """Splits src into lines and hands them to parselines, returning its result.

    A byte string is first decoded with the unit's current _encoding.
    """
    if isinstance(src, str):
        # This has not been decoded yet, so we need to make a plan
        src = src.decode(self._encoding)
    return self.parselines(src.split("\n"))
614 def _getmsgpartstr(self
, partname
, partlines
, partcomments
=""):
615 if isinstance(partlines
, dict):
616 partkeys
= partlines
.keys()
618 return "".join([self
._getmsgpartstr
("%s[%d]" % (partname
, partkey
), partlines
[partkey
], partcomments
) for partkey
in partkeys
])
619 partstr
= partname
+ " "
621 if len(partlines
) > 0 and len(partcomments
) == 0:
622 partstr
+= partlines
[0]
624 elif len(partcomments
) > 0:
625 if len(partlines
) > 0 and len(unquotefrompo(partlines
[:1])) == 0:
626 # if there is a blank leader line, it must come before the comment
627 partstr
+= partlines
[0] + '\n'
628 # but if the whole string is blank, leave it in
629 if len(partlines
) > 1:
632 # All partcomments should start on a newline
634 # combine comments into one if more than one
635 if len(partcomments
) > 1:
637 for comment
in partcomments
:
638 comment
= unquotefrompo([comment
])
639 if comment
.startswith("_:"):
640 comment
= comment
[len("_:"):]
641 if comment
.endswith("\\n"):
642 comment
= comment
[:-len("\\n")]
643 #Before we used to strip. Necessary in some cases?
644 combinedcomment
.append(comment
)
645 partcomments
= quoteforpo("_:%s" % "".join(combinedcomment
))
646 # comments first, no blank leader line needed
647 partstr
+= "\n".join(partcomments
)
648 partstr
= quote
.rstripeol(partstr
)
653 for partline
in partlines
[partstartline
:]:
654 partstr
+= partline
+ '\n'
def _encodeifneccessary(self, output):
    """encodes unicode strings and returns other strings unchanged

    @param output: a unicode or byte string
    @return: a byte string; unicode input is encoded with the unit's _encoding
    """
    if isinstance(output, unicode):
        # The unit stores its encoding in _encoding; fall back to UTF-8 only
        # if that attribute has not been set.
        encoding = encodingToUse(getattr(self, "_encoding", "UTF-8"))
        return output.encode(encoding)
    return output
665 """convert to a string. double check that unicode is handled somehow here"""
666 output
= self
._getoutput
()
667 return self
._encodeifneccessary
(output
)
def _getoutput(self):
    """return this po element as a string"""
    lines = []
    lines.extend(self.othercomments)
    if self.isobsolete():
        lines.extend(self.typecomments)
        obsoletelines = []
        if self.obsoletemsgctxt:
            obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
        obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
        if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
            obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
        obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
        for index, obsoleteline in enumerate(obsoletelines):
            # We need to account for a multiline msgid or msgstr here
            obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "')
        lines.extend(obsoletelines)
        lines = [self._encodeifneccessary(line) for line in lines]
        return "".join(lines)
    # if there's no msgid don't do msgid and string, unless we're the header
    # this will also discard any comments other than plain othercomments...
    if (len(self.msgid) == 0) or ((len(self.msgid) == 1) and (self.msgid[0] == '""')):
        if not (self.isheader() or self.msgidcomments or self.sourcecomments):
            return "".join(lines)
    lines.extend(self.automaticcomments)
    lines.extend(self.sourcecomments)
    lines.extend(self.typecomments)
    # msgctxt is only written when the unit has one, as in the obsolete branch
    if self.msgctxt:
        lines.append(self._getmsgpartstr("msgctxt", self.msgctxt))
    lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments))
    if self.msgid_plural or self.msgid_pluralcomments:
        lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
    lines.append(self._getmsgpartstr("msgstr", self.msgstr))
    lines = [self._encodeifneccessary(line) for line in lines]
    postr = "".join(lines)
    return postr
def getlocations(self):
    """Get a list of locations from sourcecomments in the PO unit

    rtype: List
    return: A list of the locations with '#: ' stripped
    """
    locations = []
    for sourcecomment in self.sourcecomments:
        # drop the end of line and the 3-character '#: ' prefix, then split
        # on whitespace because one comment line may hold several locations
        locations += quote.rstripeol(sourcecomment)[3:].split()
    return locations
def addlocation(self, location):
    """Add a location to sourcecomments in the PO unit

    @param location: Text location e.g. 'file.c:23' does not include #:
    @type location: String
    """
    self.sourcecomments.append("#: %s\n" % location)
def _extract_msgidcomments(self, text=None):
    """Extract KDE style msgid comments from the unit.

    @param text: text to extract the comment from; when empty or None the
        unit's own msgidcomments are used
    @rtype: String
    @return: Returns the extracted msgidcomments found in this unit's msgid.
    """
    if not text:
        text = unquotefrompo(self.msgidcomments)
    # only the first line can hold the comment; strip its '_: ' marker once
    return text.split('\n')[0].replace('_: ', '', 1)
def getcontext(self):
    """Get the message context."""
    # The unescaped msgctxt and the extracted msgid comment are concatenated.
    return unquotefrompo(self.msgctxt) + self._extract_msgidcomments()
744 """Returns a unique identifier for this unit."""
745 context
= self
.getcontext()
746 # Gettext does not consider the plural to determine duplicates, only
747 # the msgid. For generation of .mo files, we might want to use this
748 # code to generate the entry for the hash table, but for now, it is
749 # commented out for conformance to gettext.
750 # id = '\0'.join(self.source.strings)
752 if self
.msgidcomments
:
753 id = "_: %s\n%s" % (context
, id)
755 id = "%s\04%s" % (context
, id)
758 class pofile(pocommon
.pofile
):
759 """this represents a .po file containing various units"""
761 def __init__(self
, inputfile
=None, encoding
=None, unitclass
=pounit
):
762 """construct a pofile, optionally reading in from inputfile.
763 encoding can be specified but otherwise will be read from the PO header"""
764 self
.UnitClass
= unitclass
765 pocommon
.pofile
.__init
__(self
, unitclass
=unitclass
)
768 self
._encoding
= encodingToUse(encoding
)
769 if inputfile
is not None:
770 self
.parse(inputfile
)
772 def changeencoding(self
, newencoding
):
773 """changes the encoding on the file"""
774 self
._encoding
= encodingToUse(newencoding
)
777 header
= self
.header()
778 if not header
or header
.isblank():
781 headerstr
= unquotefrompo(header
.msgstr
)
782 for line
in headerstr
.split("\n"):
783 if not ":" in line
: continue
784 key
, value
= line
.strip().split(":", 1)
785 if key
.strip() != "Content-Type": continue
787 if charsetline
is None:
788 headerstr
+= "Content-Type: text/plain; charset=%s" % self
._encoding
790 charset
= re
.search("charset=([^ ]*)", charsetline
)
792 newcharsetline
= charsetline
793 if not newcharsetline
.strip().endswith(";"):
794 newcharsetline
+= ";"
795 newcharsetline
+= " charset=%s" % self
._encoding
797 charset
= charset
.group(1)
798 newcharsetline
= charsetline
.replace("charset=%s" % charset
, "charset=%s" % self
._encoding
, 1)
799 headerstr
= headerstr
.replace(charsetline
, newcharsetline
, 1)
800 header
.msgstr
= quoteforpo(headerstr
)
802 def parse(self
, input):
803 """parses the given file or file source string"""
805 if hasattr(input, 'name'):
806 self
.filename
= input.name
807 elif not getattr(self
, 'filename', ''):
809 if hasattr(input, "read"):
813 # TODO: change this to a proper parser that doesn't do line-by-line madness
814 lines
= input.split("\n")
817 # make only the first one the header
820 while end
<= len(lines
):
821 if (end
== len(lines
)) or (not lines
[end
].strip()): # end of lines or blank line
822 newpe
= self
.UnitClass(encoding
=self
._encoding
)
823 unit_lines
= lines
[start
:end
]
824 # We need to work carefully if we haven't decoded properly yet.
825 # So let's solve this temporarily until we actually get the
826 # encoding from the header.
828 unit_lines
= [line
.decode('ascii', 'ignore') for line
in unit_lines
]
829 linesprocessed
= newpe
.parselines(unit_lines
)
830 start
+= linesprocessed
831 # TODO: find a better way of working out if we actually read anything
832 if linesprocessed
>= 1 and newpe
._getoutput
():
833 self
.units
.append(newpe
)
835 if newpe
.isheader(): # If there is a header...
836 if "Content-Type" in self
.parseheader(): # and a Content-Type...
837 if self
._encoding
.lower() != 'charset': # with a valid charset...
838 self
._encoding
= newpe
._encoding
# then change the encoding
839 # otherwise we'll decode using UTF-8
840 lines
= self
.decode(lines
)
847 raise base
.ParseError()
849 def removeduplicates(self
, duplicatestyle
="merge"):
850 """make sure each msgid is unique ; merge comments etc from duplicates into original"""
853 # we sometimes need to keep track of what has been marked
854 # TODO: this is using a list as the pos aren't hashable, but this is slow...
856 def addcomment(thepo
):
857 thepo
.msgidcomments
.append('"_: %s\\n"' % " ".join(thepo
.getlocations()))
858 markedpos
.append(thepo
)
859 for thepo
in self
.units
:
860 if duplicatestyle
.startswith("msgid_comment"):
861 msgid
= unquotefrompo(thepo
.msgidcomments
) + unquotefrompo(thepo
.msgid
)
863 msgid
= unquotefrompo(thepo
.msgid
)
865 # header msgids shouldn't be merged...
866 uniqueunits
.append(thepo
)
867 elif duplicatestyle
== "msgid_comment_all":
869 uniqueunits
.append(thepo
)
870 elif msgid
in msgiddict
:
871 if duplicatestyle
== "merge":
873 msgiddict
[msgid
].merge(thepo
)
876 uniqueunits
.append(thepo
)
877 elif duplicatestyle
== "keep":
878 uniqueunits
.append(thepo
)
879 elif duplicatestyle
== "msgid_comment":
880 origpo
= msgiddict
[msgid
]
881 if origpo
not in markedpos
:
884 uniqueunits
.append(thepo
)
885 elif duplicatestyle
== "msgctxt":
886 origpo
= msgiddict
[msgid
]
887 if origpo
not in markedpos
:
888 origpo
.msgctxt
.append('"%s"' % " ".join(origpo
.getlocations()))
889 markedpos
.append(thepo
)
890 thepo
.msgctxt
.append('"%s"' % " ".join(thepo
.getlocations()))
891 uniqueunits
.append(thepo
)
893 if not msgid
and duplicatestyle
!= "keep":
895 msgiddict
[msgid
] = thepo
896 uniqueunits
.append(thepo
)
897 self
.units
= uniqueunits
900 """convert to a string. double check that unicode is handled somehow here"""
901 output
= self
._getoutput
()
902 if isinstance(output
, unicode):
903 return output
.encode(getattr(self
, "encoding", "UTF-8"))
def _getoutput(self):
    """convert the units back to lines

    @return: the string form of every unit, separated by blank lines and
        ending in a single newline ("" when there is no output)
    """
    lines = []
    for unit in self.units:
        unitsrc = str(unit) + "\n"
        lines.append(unitsrc)
    lines = "".join(self.encode(lines)).rstrip()
    #After the last pounit we will have \n\n and we only want to end in \n:
    if lines:
        lines += "\n"
    return lines
def encode(self, lines):
    """encode any unicode strings in lines in self._encoding

    @param lines: iterable of strings
    @return: a new list in which every unicode string has been encoded;
        UTF-8 is used when self._encoding is None or the "charset" placeholder
    """
    newlines = []
    encoding = self._encoding
    if encoding is None or encoding.lower() == "charset":
        encoding = 'UTF-8'
    for line in lines:
        if isinstance(line, unicode):
            line = line.encode(encoding)
        newlines.append(line)
    return newlines
def decode(self, lines):
    """decode any non-unicode strings in lines with self._encoding

    @param lines: iterable of strings
    @return: a new list; lines are passed through unchanged when
        self._encoding is None or the "charset" placeholder
    @raise UnicodeError: when a line cannot be decoded; the message names the
        encoding and the offending line
    """
    # the encoding does not change while looping, so test it only once
    candecode = self._encoding is not None and self._encoding.lower() != "charset"
    newlines = []
    for line in lines:
        if candecode and isinstance(line, str):
            try:
                line = line.decode(self._encoding)
            except UnicodeError as e:
                raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line))
        newlines.append(line)
    return newlines
942 for unit
in self
.units
:
943 if not (unit
.isheader() or unit
.isobsolete()):
if __name__ == '__main__':
    # Filter mode: parse a po file from standard input and write it back out.
    import sys
    pf = pofile(sys.stdin)
    sys.stdout.write(str(pf))