2 # -*- coding: utf-8 -*-
4 # This file is part of translate.
6 # translate is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # translate is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with translate; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 """reads a set of .po or .pot files to produce a pootle-terminology.pot"""
22 from translate
.lang
import factory
as lang_factory
23 from translate
.misc
import optrecurse
24 from translate
.storage
import po
25 from translate
.storage
import factory
class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    # NOTE(review): some class-level lines appear to be missing from this
    # view — later methods read counters such as self.files and self.units
    # that are not initialised anywhere visible here; confirm against the
    # full file.

    # handles c-format and python-format placeholders (%s, %(name)d,
    # %1$x, %-8.3f, ...) so they can be stripped before term matching
    formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # handles XML/HTML elements (<foo>text</foo> => text)
    xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
    # handles XML/HTML entities (&#32; &#x20; &amp; &my_entity;)
    # NOTE(review): in the trailing class, ".-:" is a character *range*
    # (0x2E-0x3A, which also matches "/" and digits), not the literal set
    # {'.', '-', ':'} — presumably "[\w.:-]" was intended; confirm before
    # changing the pattern.
    xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.-:]*);",
                           flags=re.UNICODE|re.IGNORECASE)

    # recognized values for the --sort option, listed in the default
    # priority in which the sorts are applied
    sortorders = [ "frequency", "dictionary", "length" ]
def parse_args(self, args=None, values=None):
    """parses the command line options, handling implicit input/output args"""
    (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
    # some intelligence as to what reasonable people might give on the command line
    if args and not options.input:
        if not options.output and len(args) > 1:
            # all but the last freestanding arg become the inputs
            options.input = args[:-1]
            # (source truncated here in this view — presumably the arg-list
            # trimming and an else-branch are missing)
    if args and not options.output:
        # last remaining freestanding arg becomes the output file
        options.output = args[-1]
        # (source truncated — presumably "args = args[:-1]")
    if not options.output:
        options.output = "pootle-terminology.pot"
    # (source truncated — presumably an "if args:" guard before this error)
    self.error("You have used an invalid combination of --input, --output and freestanding args")
    if isinstance(options.input, list) and len(options.input) == 1:
        # a single-element input list collapses to a plain path
        options.input = options.input[0]
        if options.inputmin == None:
            # (source truncated — presumably "options.inputmin = 1")
    elif options.inputmin == None:
        # (source truncated — presumably "options.inputmin = 2")
    return (options, args)
def set_usage(self, usage=None):
    """sets the usage string - if usage not given, uses getusagestring for each option"""
    # (source truncated before the next line — presumably
    # "if usage is None:")
    self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
        "\n input directory is searched for PO files, terminology PO file is output file"
    # (source truncated — presumably an "else:" introducing the delegating
    # call below)
    super(TerminologyOptionParser, self).set_usage(usage)
# NOTE(review): the "def" line of this method (presumably "def run(self):")
# is missing from this view; the statements below parse the command line
# and then drive the recursive processing of the input files.
"""parses the arguments, and runs recursiveprocess with the resulting options"""
(options, args) = self.parse_args()
# propagate the parser's format tables onto the options object for the
# recursion helpers
options.inputformats = self.inputformats
options.outputoptions = self.outputoptions
# optional psyco JIT acceleration (Python 2 era helper)
self.usepsyco(options)
self.recursiveprocess(options)
def recursiveprocess(self, options):
    """recurse through directories and process files"""
    # work out the list of input files, recursing into directories if asked
    if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
        if isinstance(options.input, list):
            inputfiles = self.recurseinputfilelist(options)
        # (source truncated — presumably "else:")
            inputfiles = self.recurseinputfiles(options)
    # (source truncated — presumably "else:" and an "if options.input:")
        inputfiles = [os.path.basename(options.input)]
        options.input = os.path.dirname(options.input)
        # (source truncated — presumably "else:")
        inputfiles = [options.input]
    if os.path.isdir(options.output):
        # output given as a directory: use the default file name inside it
        options.output = os.path.join(options.output,"pootle-terminology.pot")
    # (source truncated — presumably initialisation of self.stopwords and
    # self.stoprelist, which the stopword parsing below populates)
    # maps a stopword-list prefix character to its set of ignore actions
    actions = { '+': frozenset(), ':': frozenset(['skip']),
        '<': frozenset(['phrase']), '=': frozenset(['word']),
        '>': frozenset(['word','skip']),
        '@': frozenset(['word','phrase']) }
    if options.stopwordfile != None:
        stopfile = open(options.stopwordfile, "r")
        # (source truncated — presumably "try:")
        for stopline in stopfile:
            stoptype = stopline[0]
            if stoptype == '#' or stoptype == "\n":
                # (source truncated — presumably "continue": skip comments
                # and blank lines)
            elif stoptype == '/':
                # lines starting with "/" are regexes, anchored at the end
                self.stoprelist.append(re.compile(stopline[1:-1]+'$'))
            # (source truncated — presumably "else:")
                self.stopwords[stopline[1:-1]] = actions[stoptype]
        except KeyError, character:
            self.warning("Bad line in stopword list %s starts with" % (options.stopwordfile), options, sys.exc_info())
        # (source truncated — presumably closing the stopword file and
        # initialising self.glossary)
    self.initprogressbar(inputfiles, options)
    for inputpath in inputfiles:
        # (source truncated — presumably a per-file counter increment)
        fullinputpath = self.getfullinputpath(options, inputpath)
        # (source truncated — presumably "success = True" and "try:")
        self.processfile(None, options, fullinputpath)
        except Exception, error:
            if isinstance(error, KeyboardInterrupt):
                # (source truncated — presumably "raise")
            self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
            # (source truncated — presumably "success = False")
        self.reportprogress(inputpath, success)
    # (source truncated)
    self.outputterminology(options)
def clean(self, string, options):
    """Strip markup/placeholder noise from *string* and return the text
    to be matched against the glossary.

    Removes each accelerator character listed in options.accelchars,
    replaces printf-style format specifiers, XML/HTML elements and
    XML/HTML entities with spaces (using the class-level compiled
    patterns), and trims surrounding whitespace.

    Fix: the visible original body computed the cleaned string but never
    returned it, while callers assign the result
    (e.g. ``source = self.clean(unit.source, options)``) — added the
    missing ``return string``.
    """
    for accelerator in options.accelchars:
        string = string.replace(accelerator, "")
    string = self.formatpat.sub(" ", string)   # %s, %(name)d, %1$x, ...
    string = self.xmlelpat.sub(" ", string)    # <tag ...>, </tag>
    string = self.xmlentpat.sub(" ", string)   # &amp; &#32; &#x20;
    string = string.strip()
    return string
def addphrases(self, words, skips, translation, partials=True):
    """adds (sub)phrases with non-skipwords and more than one word"""
    # only record the phrase if it contains more non-skip words than
    # skips and neither its first nor last word carries the 'skip' action
    if (len(words) > skips + 1 and
        'skip' not in self.stopwords.get(words[0], frozenset()) and
        'skip' not in self.stopwords.get(words[-1], frozenset())):
        self.glossary.setdefault(' '.join(words), []).append(translation)
    # (source truncated — presumably the "partials" handling that builds
    # "part", a shrinking copy of words, before the lines below)
        if 'skip' in self.stopwords.get(part.pop(), frozenset()):
            # (source truncated — presumably adjusting the skip count)
        if (len(part) > skips + 1 and
            'skip' not in self.stopwords.get(part[0], frozenset()) and
            'skip' not in self.stopwords.get(part[-1], frozenset())):
            self.glossary.setdefault(' '.join(part), []).append(translation)
def processfile(self, fileprocessor, options, fullinputpath):
    """process an individual file"""
    inputfile = self.openinputfile(options, fullinputpath)
    # parse the raw file into a translation store object
    inputfile = factory.getobject(inputfile)
    sourcelang = lang_factory.getlanguage(options.sourcelanguage)
    # ignore-action set applied when a stopword *regex* matches
    rematchignore = frozenset(('word','phrase'))
    defaultignore = frozenset()
    for unit in inputfile.units:
        # (source truncated — presumably skipping header/untranslated
        # units and a unit counter)
        if not options.invert:
            source = self.clean(unit.source, options)
            target = self.clean(unit.target, options)
        # (source truncated — presumably "else:" swapping the roles)
            target = self.clean(unit.source, options)
            source = self.clean(unit.target, options)
        # (source truncated)
        for sentence in sourcelang.sentences(source):
            # (source truncated — presumably resetting words/skips for
            # the sentence)
            for word in sourcelang.words(sentence):
                # fold case if requested (always, or only for Title Case)
                if options.ignorecase or (options.foldtitle and word.istitle()):
                    # (source truncated — presumably "word = word.lower()")
                ignore = defaultignore
                if word in self.stopwords:
                    ignore = self.stopwords[word]
                # (source truncated)
                for stopre in self.stoprelist:
                    if stopre.match(word) != None:
                        ignore = rematchignore
                        # (source truncated — presumably "break")
                translation = (source, target, unit, fullinputpath)
                if 'word' not in ignore:
                    # (source truncated — presumably singular/plural root
                    # folding: "root = word")
                    if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                        # (source truncated — presumably "root = word[:-1]")
                    elif len(root) > 2 and root + 's' in self.glossary:
                        # fold a previously seen plural onto the singular root
                        self.glossary[root] = self.glossary.pop(root + 's')
                    self.glossary.setdefault(root, []).append(translation)
                if options.termlength > 1:
                    if 'phrase' in ignore:
                        # add trailing phrases in previous words
                        while len(words) > 2:
                            if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                # (source truncated — presumably "skips -= 1")
                            self.addphrases(words, skips, translation)
                        # (source truncated — presumably resetting the
                        # rolling words/skips window)
                    # (source truncated — presumably appending the current
                    # word and updating skips before the length checks)
                    if len(words) > options.termlength + skips:
                        while len(words) > options.termlength + skips:
                            if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                # (source truncated — presumably "skips -= 1")
                        self.addphrases(words, skips, translation)
                    # (source truncated — presumably "else:")
                        self.addphrases(words, skips, translation, partials=False)
            if options.termlength > 1:
                # add trailing phrases in sentence after reaching end
                while options.termlength > 1 and len(words) > 2:
                    if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                        # (source truncated — presumably "skips -= 1")
                    self.addphrases(words, skips, translation)
def outputterminology(self, options):
    """saves the generated terminology glossary"""
    termfile = po.pofile()
    # (source truncated — presumably "terms = {}", the score map filled
    # in below)
    # strips trailing ":123" line numbers so locations from the same
    # source file collapse together
    locre = re.compile(r":[0-9]+$")
    print >> sys.stderr, ("%d terms from %d units in %d files" %
        (len(self.glossary), self.units, self.files))
    for term, translations in self.glossary.iteritems():
        if len(translations) <= 1:
            # (source truncated — presumably "continue": terms seen once
            # are dropped)
        # (source truncated)
        termunit = po.pounit(term)
        # (source truncated — presumably initialisation of the locations /
        # sourcenotes / transnotes / filecounts / sources / targets
        # accumulators used below)
        for source, target, unit, filename in translations:
            # (source truncated)
            filecounts[filename] = filecounts.setdefault(filename, 0) + 1
            if term.lower() == self.clean(unit.source, options).lower():
                # (source truncated — presumably tracking full-message
                # occurrences in "sources")
                target = self.clean(unit.target, options)
            if options.ignorecase or (options.foldtitle and target.istitle()):
                target = target.lower()
            unit.settarget(target)
            # (source truncated)
            targets.setdefault(target, []).append(filename)
            if term.lower() == unit.source.strip().lower():
                sourcenotes[unit.getnotes("source code")] = None
                transnotes[unit.getnotes("translator")] = None
            # (source truncated)
            termunit.merge(unit, overwrite=False, comments=False)
            for loc in unit.getlocations():
                locations.setdefault(locre.sub("", loc))
        numsources = len(sources)
        numfiles = len(filecounts)
        numlocs = len(locations)
        # apply the spread thresholds
        if numfiles < options.inputmin or numlocs < options.locmin:
            # (source truncated — presumably "continue")
        if numsources < options.fullmsgmin:
            # (source truncated)
        elif numsources < options.substrmin:
            # (source truncated)
        if len(targets.keys()) > 1:
            # multiple candidate translations: annotate each with the
            # files it came from
            txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                for target, files in targets.iteritems()])
            if termunit.gettarget().find('};') < 0:
                termunit.settarget(txt)
                # (source truncated — presumably "else:")
            # if annotated multiple terms already present, keep as-is
            termunit.addnote(txt, "translator")
        locmax = 2 * options.locmin
        # (source truncated — presumably "if numlocs > locmax:")
        for location in locations.keys()[0:locmax]:
            termunit.addlocation(location)
        termunit.addlocation("(poterminology) %d more locations"
            % (numlocs - locmax))
        # (source truncated — presumably "else:")
        for location in locations.keys():
            termunit.addlocation(location)
        for sourcenote in sourcenotes.keys():
            termunit.addnote(sourcenote, "source code")
        for transnote in transnotes.keys():
            termunit.addnote(transnote, "translator")
        for filename, count in filecounts.iteritems():
            termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
        # score: file spread dominates, then number of full-message sources
        terms[term] = (((10 * numfiles) + numsources, termunit))
    # (source truncated)
    termlist = terms.keys()
    print >> sys.stderr, "%d terms after thresholding" % len(termlist)
    # shortest terms first so subphrase reduction visits subphrases
    # before their superphrases
    termlist.sort(lambda x, y: cmp(len(x), len(y)))
    for term in termlist:
        # (source truncated — presumably splitting the term into words
        # before dropping equal-scored leading/trailing subphrases)
        while len(words) > 2:
            # (source truncated — presumably "words.pop()")
            if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                del terms[' '.join(words)]
        # (source truncated)
        while len(words) > 2:
            # (source truncated — presumably "words.pop(0)")
            if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                del terms[' '.join(words)]
    print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
    termitems = terms.values()
    if options.sortorders == None:
        options.sortorders = self.sortorders
    # apply the requested sort orders, least significant first (popping
    # from the end of the list)
    while len(options.sortorders) > 0:
        order = options.sortorders.pop()
        if order == "frequency":
            termitems.sort(lambda x, y: cmp(y[0], x[0]))
        elif order == "dictionary":
            termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
        elif order == "length":
            termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
        # (source truncated — presumably "else:")
            self.warning("unknown sort order %s" % order, options)
    for count, unit in termitems:
        termfile.units.append(unit)
    # NOTE(review): the output file handle is never closed explicitly;
    # relies on refcounting — consider a with-block when modernising
    open(options.output, "w").write(str(termfile))
# NOTE(review): the enclosing definition (presumably "def main():") is
# missing from this view; these statements build the option parser,
# register all poterminology-specific options and hand control to it.
formats = {"po":("po", None), None:("po", None)}
parser = TerminologyOptionParser(formats)
parser.add_option("-I", "--ignore-case", dest="ignorecase",
    action="store_true", default=False, help="make all terms lowercase")
parser.add_option("-F", "--fold-titlecase", dest="foldtitle",
    action="store_true", default=False, help="fold \"Title Case\" to lowercase")
parser.add_option("", "--accelerator", dest="accelchars", default="",
    metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")
parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
    help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
    help="omit terms appearing in less than MIN input files (default 1 - 2 if multiple input files)", metavar="MIN")
parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
    help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
    help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
    help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")
parser.add_option("", "--sort", dest="sortorders", action="append",
    type="choice", choices=parser.sortorders, metavar="ORDER",
    help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
parser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
    help="name of file containing stopword list", metavar="FILENAME")
parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
    help="the source language code (default 'en')", metavar="LANG")
parser.add_option("-v", "--invert", dest="invert",
    action="store_true", default=False, help="invert the source and target languages for terminology")
# (source truncated)
parser.description = __doc__
# (source truncated — presumably the call that sets usage and runs the
# parser)

if __name__ == '__main__':
    # (source truncated — the guarded call, presumably "main()", is not
    # visible in this view)