fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / storage / statsdb.py
blobaba7d1444d06f1af8289f9fa06201223b8f7eeb8
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2007 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """Module to provide a cache of statistics in a database.
24 @organization: Zuza Software Foundation
25 @copyright: 2007 Zuza Software Foundation
26 @license: U{GPL <http://www.fsf.org/licensing/licenses/gpl.html>}
27 """
29 from translate import __version__ as toolkitversion
30 from translate.storage import factory, base
31 from translate.misc.multistring import multistring
32 from translate.lang.common import Common
34 try:
35 from sqlite3 import dbapi2
36 except ImportError:
37 from pysqlite2 import dbapi2
38 import os.path
39 import re
40 import sys
41 import stat
# Patterns used by wordcount() to normalise a string before counting words.
# Marker found at the start of KDE style plural messages.
kdepluralre = re.compile("^_n: ")
# <br>, <br/> and <br /> tags (raw string so that \s is a regex escape and
# not an invalid string escape).
brtagre = re.compile(r"<br\s*?/?>")
# Any other markup tag.
xmltagre = re.compile("<[^>]+>")
# A full stop enclosed by two non-digit characters.
numberre = re.compile(r"\D\.\D")

# Maps the numeric state stored in the database to the name used in the
# statistics dictionaries.
state_strings = {0: "untranslated", 1: "translated", 2: "fuzzy"}
def wordcount(string):
    """Returns the number of words in the string, after stripping the KDE
    plural marker and markup tags from it."""
    # TODO: po class should understand KDE style plurals
    substitutions = (
        (kdepluralre, ""),
        (brtagre, "\n"),
        (xmltagre, ""),
        (numberre, " "),
    )
    # The order matters: <br> tags must become newlines before the generic
    # tag pattern removes whatever markup is left.
    for pattern, replacement in substitutions:
        string = pattern.sub(replacement, string)
    #TODO: This should still use the correct language to count in the target
    #language
    words = Common.words(string)
    return len(words)
def wordsinunit(unit):
    """Counts the words in the unit's source and target, taking plurals into
    account. The target words are only counted if the unit is translated.

    @return: a tuple (sourcewords, targetwords)
    """
    def allstrings(text):
        # A multistring carries one string per plural form; anything else
        # (including None) is treated as a single, possibly empty, string.
        if isinstance(text, multistring):
            return text.strings
        return [text or ""]

    sourcewords = sum([wordcount(s) for s in allstrings(unit.source)])
    if not unit.istranslated():
        return sourcewords, 0
    targetwords = sum([wordcount(s) for s in allstrings(unit.target)])
    return sourcewords, targetwords
def statefordb(unit):
    """Returns the numeric database state for the unit.

    The result is 1 for a translated unit, 2 for a fuzzy unit that has a
    target and 0 for everything else (see state_strings for the names).
    """
    if unit.istranslated():
        state = 1
    elif unit.isfuzzy() and unit.target:
        state = 2
    else:
        state = 0
    return state
def emptystats():
    """Returns a new dictionary with all statistics initialised to 0.

    For every state there are three keys: the state name itself (the unit
    count), and the name followed by "sourcewords" and "targetwords".
    """
    states = ("total", "translated", "fuzzy", "untranslated", "review")
    suffixes = ("", "sourcewords", "targetwords")
    return dict([(state + suffix, 0) for state in states for suffix in suffixes])
# We allow the caller to specify which value to return when errors_return_empty
# is True. We do this, since Pootle wants None to be returned when it calls
# get_mod_info directly, whereas we want an integer to be returned for
# uses of get_mod_info within this module.
# TODO: Get rid of empty_return when Pootle code is improved to not require
# this.
def get_mod_info(file_path, errors_return_empty=False, empty_return=0):
    """Returns the modification info (mtime, size) of the given file.

    @param file_path: the path of the file to examine
    @param errors_return_empty: if True, empty_return is returned instead of
    raising when the file can not be examined or is a directory
    @param empty_return: the value to return on errors when
    errors_return_empty is True
    @raise AssertionError: if file_path is a directory
    @raise OSError: if the file can not be stat'ed
    """
    try:
        file_stat = os.stat(file_path)
        # Raised explicitly rather than with an assert statement, so that the
        # check is not stripped when running with -O.
        if stat.S_ISDIR(file_stat.st_mode):
            raise AssertionError("%r is a directory" % (file_path,))
        return (file_stat.st_mtime, file_stat.st_size)
    except Exception:
        if errors_return_empty:
            return empty_return
        raise
def suggestioninfo(filename, **kwargs):
    """Provides the filename of the associated file containing suggestions and
    its mod_info, if it exists.

    (None, -1) is returned for anything but a PO file, and for a PO file
    without a suggestion file. All keyword arguments are passed on to
    get_mod_info.
    """
    extension = os.path.splitext(filename)[1]
    if extension != os.path.extsep + "po":
        return None, -1
    # For a PO file there might be an associated file with suggested
    # translations. If either file changed, we want to regenerate the
    # statistics.
    pending_filename = filename + os.path.extsep + 'pending'
    if not os.path.exists(pending_filename):
        return None, -1
    return pending_filename, get_mod_info(pending_filename, **kwargs)
def parse_mod_info(string):
    """Parses a mod_info as written by dump_mod_info back into a tuple
    (mtime, size).

    The mtime is read as a float, unless os.stat_float_times() reports that
    this interpreter uses integer timestamps.

    @return: the parsed tuple, or (-1, -1) if the string can not be parsed
    """
    # Neither long nor os.stat_float_times exists on every interpreter. Look
    # them up defensively: if their absence were swallowed by the handler
    # below, every stored record would silently parse as (-1, -1).
    try:
        tolong = long
    except NameError:
        tolong = int
    float_times = getattr(os, "stat_float_times", None)
    try:
        tokens = string.strip("()").split(",")
        if float_times is None or float_times():
            return (float(tokens[0]), tolong(tokens[1]))
        return (int(tokens[0]), tolong(tokens[1]))
    except Exception:
        # Deliberately lenient: anything unparsable counts as "no mod_info".
        return (-1, -1)
def dump_mod_info(mod_info):
    """Returns the string form of mod_info for storage in the database (see
    parse_mod_info for the reverse)."""
    dumped = str(mod_info)
    return dumped
class StatsCache(object):
    """An object instantiated as a singleton for each statsfile that provides
    access to the database cache from a pool of StatsCache objects."""

    # The pool of open caches, keyed by the real path of their stats file
    _caches = {}
    # The path of the default stats file; determined on first use
    defaultfile = None
    # This cache's connection
    con = None
    # The current cursor
    cur = None

    def __new__(cls, statsfile=None):
        """Returns the cache for the given stats file, opening the database
        and creating its tables if this file has no cache in the pool yet.

        @param statsfile: the path of the database file. If not given,
        stats.db in a per-user toolkit directory is used, and that directory
        is created if it does not exist.
        """
        if not statsfile:
            if not cls.defaultfile:
                userdir = os.path.expanduser("~")
                if os.name == "nt":
                    cachedir = os.path.join(userdir, "Translate Toolkit")
                else:
                    cachedir = os.path.join(userdir, ".translate_toolkit")
                if not os.path.exists(cachedir):
                    os.mkdir(cachedir)
                cls.defaultfile = os.path.realpath(os.path.join(cachedir, "stats.db"))
            statsfile = cls.defaultfile
        else:
            statsfile = os.path.realpath(statsfile)
        # First see if a cache for this file already exists:
        if statsfile in cls._caches:
            return cls._caches[statsfile]
        # No existing cache. Let's build a new one. It only goes into the
        # pool once it is fully set up, so that a failure to open the
        # database does not leave a cache without a connection behind.
        cache = object.__new__(cls)
        cache.con = dbapi2.connect(statsfile)
        cache.cur = cache.con.cursor()
        cache.create()
        cls._caches[statsfile] = cache
        return cache

    def create(self):
        """Create all tables and indexes."""
        self.cur.execute("""CREATE TABLE IF NOT EXISTS files(
            fileid INTEGER PRIMARY KEY AUTOINCREMENT,
            path VARCHAR NOT NULL UNIQUE,
            mod_info CHAR(50) NOT NULL,
            toolkitbuild INTEGER NOT NULL);""")
        # mod_info should never be larger than about 138 bits as computed by
        # get_mod_info. This is because st_mtime is at most 64 bits, multiplying
        # by 1000 adds at most 10 bits and file_stat.st_size is at most 64 bits.
        # Therefore, we should get away with 50 decimal digits (actually, we need
        # math.log((1 << 139) - 1, 10) = 41.8 characters, but whatever).

        self.cur.execute("""CREATE UNIQUE INDEX IF NOT EXISTS filepathindex
            ON files (path);""")

        self.cur.execute("""CREATE TABLE IF NOT EXISTS units(
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            unitid VARCHAR NOT NULL,
            fileid INTEGER NOT NULL,
            unitindex INTEGER NOT NULL,
            source VARCHAR NOT NULL,
            target VARCHAR,
            state INTEGER,
            sourcewords INTEGER,
            targetwords INTEGER);""")

        self.cur.execute("""CREATE INDEX IF NOT EXISTS fileidindex
            ON units(fileid);""")

        self.cur.execute("""CREATE TABLE IF NOT EXISTS checkerconfigs(
            configid INTEGER PRIMARY KEY AUTOINCREMENT,
            config VARCHAR);""")

        self.cur.execute("""CREATE INDEX IF NOT EXISTS configindex
            ON checkerconfigs(config);""")

        self.cur.execute("""CREATE TABLE IF NOT EXISTS uniterrors(
            errorid INTEGER PRIMARY KEY AUTOINCREMENT,
            unitindex INTEGER NOT NULL,
            fileid INTEGER NOT NULL,
            configid INTEGER NOT NULL,
            name VARCHAR NOT NULL,
            message VARCHAR);""")

        self.cur.execute("""CREATE INDEX IF NOT EXISTS uniterrorindex
            ON uniterrors(fileid, configid);""")

        self.con.commit()

    def _getfileid(self, filename, opt_mod_info=(-1, -1), check_mod_info=True, store=None, errors_return_empty=False):
        """Find the fileid of the given file, (re)caching the file first if
        it has no record yet or if its record is out of date.

        @param filename: the filename to retrieve the id for
        @param opt_mod_info: an optional mod_info to consider in addition
        to the actual mod_info of the given file
        @param check_mod_info: if False, an existing record is not checked
        but simply updated with the current mod_info. This is only possible
        if the file already has a record.
        @param store: the already parsed store of the file, to avoid parsing
        it again when it needs to be cached
        @param errors_return_empty: if True, -1 is returned instead of
        raising when the file can not be examined or parsed
        @rtype: Integer
        """
        realpath = os.path.realpath(filename)
        self.cur.execute("""SELECT fileid, mod_info FROM files
            WHERE path=?;""", (realpath,))
        filerow = self.cur.fetchone()
        try:
            mod_info = get_mod_info(realpath)
            # Callers pass a non-tuple (like -1) when there is no additional
            # mod_info to consider. Only real mod_infos are compared, rather
            # than relying on how values of different types happen to order.
            if isinstance(opt_mod_info, tuple):
                mod_info = max(opt_mod_info, mod_info)
            if filerow:
                fileid = filerow[0]
                if not check_mod_info:
                    # Update the mod_info of the file
                    self.cur.execute("""UPDATE files
                        SET mod_info=?
                        WHERE fileid=?;""", (dump_mod_info(mod_info), fileid))
                    return fileid
                if parse_mod_info(filerow[1]) == mod_info:
                    return fileid
            # We can only ignore the mod_info if the row already exists.
            # Raised explicitly, since an assert statement is stripped when
            # running with -O.
            if not check_mod_info:
                raise AssertionError("no cached record for %r" % (realpath,))
            store = store or factory.getobject(filename)
            return self._cachestore(store, mod_info)
        except (base.ParseError, IOError, OSError, AssertionError):
            if errors_return_empty:
                return -1
            raise

    def _getstoredcheckerconfig(self, checker):
        """See if this checker configuration has been used before.

        @return: the configid of the stored configuration, or None
        """
        config = str(checker.config.__dict__)
        self.cur.execute("""SELECT configid, config FROM checkerconfigs WHERE
            config=?;""", (config,))
        configrow = self.cur.fetchone()
        if not configrow or configrow[1] != config:
            return None
        return configrow[0]

    def _cacheunitstats(self, units, fileid, unitindex=None):
        """Cache the statistics for the supplied unit(s).

        @param unitindex: if given, this index is stored for the unit(s)
        instead of their position in units
        @return: the state name of the first unit if unitindex was given,
        otherwise an empty string
        """
        # An index of 0 is a perfectly valid index, so test against None.
        single = unitindex is not None
        unitvalues = []
        for index, unit in enumerate(units):
            if unit.istranslatable():
                sourcewords, targetwords = wordsinunit(unit)
                if single:
                    index = unitindex
                # what about plurals in .source and .target?
                unitvalues.append((unit.getid(), fileid, index,
                                   unit.source, unit.target,
                                   sourcewords, targetwords,
                                   statefordb(unit)))
        # XXX: executemany is non-standard
        self.cur.executemany("""INSERT INTO units
            (unitid, fileid, unitindex, source, target, sourcewords, targetwords, state)
            values (?, ?, ?, ?, ?, ?, ?, ?);""",
            unitvalues)
        self.con.commit()
        if single:
            return state_strings[statefordb(units[0])]
        return ""

    def _cachestore(self, store, mod_info):
        """Calculates and caches the statistics of the given store
        unconditionally.

        The modification time of the store's file is set to the time in
        mod_info.

        @return: the fileid of the new record
        """
        realpath = os.path.realpath(store.filename)
        os.utime(realpath, (mod_info[0], mod_info[0]))
        # The new record gets a new fileid, so everything stored under the
        # previous one has to be removed here, or it stays behind forever.
        self.cur.execute("""SELECT fileid FROM files WHERE
            path=?;""", (realpath,))
        for staleid in [row[0] for row in self.cur.fetchall()]:
            self.cur.execute("""DELETE FROM units WHERE
                fileid=?;""", (staleid,))
            self.cur.execute("""DELETE FROM uniterrors WHERE
                fileid=?;""", (staleid,))
        self.cur.execute("""DELETE FROM files WHERE
            path=?;""", (realpath,))
        self.cur.execute("""INSERT INTO files
            (fileid, path, mod_info, toolkitbuild) values (NULL, ?, ?, ?);""",
            (realpath, dump_mod_info(mod_info), toolkitversion.build))
        fileid = self.cur.lastrowid
        self.cur.execute("""DELETE FROM units WHERE
            fileid=?""", (fileid,))
        self._cacheunitstats(store.units, fileid)
        return fileid

    def directorytotals(self, dirname):
        """Retrieves the stored statistics for a given directory, all summed.

        Note that this does not check for mod_infos or the presence of files.

        @return: a list with one row (state, total, sourcewords, targetwords)
        for each state that occurs in the files below the directory
        """
        # The trailing separator keeps /a/proj from also matching /a/proj2.
        prefix = os.path.join(os.path.realpath(dirname), "")
        # substr() counts characters from 1 in SQLite.
        self.cur.execute("""SELECT
            state,
            count(unitid) as total,
            sum(sourcewords) as sourcewords,
            sum(targetwords) as targetwords
            FROM units WHERE fileid IN
                (SELECT fileid from files
                WHERE substr(path, 1, ?)=?)
            GROUP BY state;""", (len(prefix), prefix))
        return self.cur.fetchall()

    def filetotals(self, filename, **kwargs):
        """Retrieves the statistics for the given file if possible, otherwise
        delegates to cachestore().

        @return: a dictionary like the one of emptystats(), or an empty
        dictionary if a ValueError occurred while looking up the file
        """
        try:
            fileid = self._getfileid(filename, **kwargs)
        except ValueError:
            sys.stderr.write(str(sys.exc_info()[1]) + "\n")
            return {}

        self.cur.execute("""SELECT
            state,
            count(unitid) as total,
            sum(sourcewords) as sourcewords,
            sum(targetwords) as targetwords
            FROM units WHERE fileid=?
            GROUP BY state;""", (fileid,))
        values = self.cur.fetchall()

        totals = emptystats()
        for stateset in values:
            state = state_strings[stateset[0]]                  # state
            totals[state] = stateset[1] or 0                    # total
            # A sum is NULL if all the summed values are NULL
            totals[state + "sourcewords"] = stateset[2] or 0    # sourcewords
            totals[state + "targetwords"] = stateset[3] or 0    # targetwords
        totals["total"] = totals["untranslated"] + totals["translated"] + totals["fuzzy"]
        totals["totalsourcewords"] = totals["untranslatedsourcewords"] + \
                                     totals["translatedsourcewords"] + \
                                     totals["fuzzysourcewords"]
        return totals

    def _cacheunitschecks(self, units, fileid, configid, checker, unitindex=None):
        """Helper method for cachestorechecks() and recacheunit()

        @param unitindex: if given, this index is stored for the unit(s)
        instead of their position in units
        @return: the list of the names of all failed checks, each prefixed
        with "check-". If unitindex was given, "total" is appended to it.
        """
        # An index of 0 is a perfectly valid index, so test against None.
        single = unitindex is not None
        unitvalues = []
        if not single:
            # We always want to store one dummy error to know that we have
            # actually run the checks on this file with the current checker
            # configuration. When only updating a single unit, we don't want
            # to add an extra noerror-entry.
            unitvalues.append((-1, fileid, configid, "noerror", ""))
        # if we are doing a single unit, we want to return the checknames
        errornames = []
        for index, unit in enumerate(units):
            if unit.istranslatable():
                # Correctly assign the unitindex
                if single:
                    index = unitindex
                failures = checker.run_filters(unit)
                for checkname, checkmessage in failures.items():
                    unitvalues.append((index, fileid, configid, checkname, checkmessage))
                    errornames.append("check-" + checkname)
        checker.setsuggestionstore(None)

        if single:
            errornames.append("total")

        # XXX: executemany is non-standard
        self.cur.executemany("""INSERT INTO uniterrors
            (unitindex, fileid, configid, name, message)
            values (?, ?, ?, ?, ?);""",
            unitvalues)
        self.con.commit()
        return errornames

    def cachestorechecks(self, fileid, store, checker, configid):
        """Calculates and caches the error statistics of the given store
        unconditionally."""
        # Let's purge all previous failures because they will probably just
        # fill up the database without much use.
        self.cur.execute("""DELETE FROM uniterrors WHERE
            fileid=?;""", (fileid,))
        self._cacheunitschecks(store.units, fileid, configid, checker)
        return fileid

    def recacheunit(self, filename, checker, unit):
        """Recalculate all information for a specific unit. This is necessary
        for updating all statistics when a translation of a unit took place,
        for example.

        This method assumes that everything was up to date before (file totals,
        checks, checker config, etc.)

        @return: a list with the state name of the unit, followed by the
        names returned by _cacheunitschecks()
        """
        suggestion_filename, suggestion_mod_info = suggestioninfo(filename)
        fileid = self._getfileid(filename, suggestion_mod_info, check_mod_info=False)
        configid = self._getstoredcheckerconfig(checker)
        unitid = unit.getid()
        # get the unit index
        # NOTE(review): this fails with a TypeError if the unit is not in the
        # cache yet - see the assumption in the docstring.
        self.cur.execute("""SELECT unitindex FROM units WHERE
            fileid=? AND unitid=?;""", (fileid, unitid))
        unitindex = self.cur.fetchone()[0]
        self.cur.execute("""DELETE FROM units WHERE
            fileid=? AND unitid=?;""", (fileid, unitid))
        state = [self._cacheunitstats([unit], fileid, unitindex)]
        # remove the current errors
        self.cur.execute("""DELETE FROM uniterrors WHERE
            fileid=? AND unitindex=?;""", (fileid, unitindex))
        if suggestion_filename:
            checker.setsuggestionstore(factory.getobject(suggestion_filename, ignore=os.path.extsep + 'pending'))
        state.extend(self._cacheunitschecks([unit], fileid, configid, checker, unitindex))
        return state

    def filechecks(self, filename, checker, store=None, **kwargs):
        """Retrieves the error statistics for the given file if possible,
        otherwise delegates to cachestorechecks().

        @return: a dictionary mapping the name of each failed check,
        prefixed with "check-", to the list of indices of the failing units.
        An empty dictionary is returned if a ValueError occurred while
        looking up the file.
        """
        suggestion_filename, suggestion_mod_info = suggestioninfo(filename, **kwargs)
        fileid = None
        configid = self._getstoredcheckerconfig(checker)
        try:
            fileid = self._getfileid(filename, suggestion_mod_info, store=store, **kwargs)
            if not configid:
                self.cur.execute("""INSERT INTO checkerconfigs
                    (configid, config) values (NULL, ?);""",
                    (str(checker.config.__dict__),))
                configid = self.cur.lastrowid
        except ValueError:
            sys.stderr.write(str(sys.exc_info()[1]) + "\n")
            return {}

        def geterrors():
            self.cur.execute("""SELECT
                name,
                unitindex
                FROM uniterrors WHERE fileid=? and configid=?
                ORDER BY unitindex;""", (fileid, configid))
            return self.cur.fetchall()

        values = geterrors()
        if not values:
            # This could happen if we haven't done the checks before, or the
            # file changed, or we are using a different configuration
            store = store or factory.getobject(filename)
            if suggestion_filename:
                checker.setsuggestionstore(factory.getobject(suggestion_filename, ignore=os.path.extsep + 'pending'))
            self.cachestorechecks(fileid, store, checker, configid)
            values = geterrors()

        errors = {}
        for name, unitindex in values:
            # Skip the dummy entry that only records that the checks were run
            if unitindex == -1:
                continue
            checkkey = 'check-' + name
            if checkkey not in errors:
                errors[checkkey] = []
            errors[checkkey].append(unitindex)

        return errors

    def filestats(self, filename, checker, store=None, **kwargs):
        """Return a dictionary of property names mapping sets of unit
        indices with those properties."""
        stats = {"total": [], "translated": [], "fuzzy": [], "untranslated": []}

        stats.update(self.filechecks(filename, checker, store, **kwargs))
        fileid = self._getfileid(filename, store=store, **kwargs)

        self.cur.execute("""SELECT
            state,
            unitindex
            FROM units WHERE fileid=?
            ORDER BY unitindex;""", (fileid,))

        for state, unitindex in self.cur.fetchall():
            stats[state_strings[state]].append(unitindex)
            stats["total"].append(unitindex)

        return stats

    def unitstats(self, filename, _lang=None, store=None, **kwargs):
        # For now, lang and store are unused. lang will allow the user to
        # base stats information on the given language. See the commented
        # line containing stats.update below.
        """Return a dictionary of property names mapping to arrays which
        map unit indices to property values.

        Please note that this is different from filestats, since filestats
        supplies sets of unit indices with a given property, whereas this
        method supplies arrays which map unit indices to given values."""
        stats = {"sourcewordcount": [], "targetwordcount": []}

        #stats.update(self.unitchecks(filename, lang, store))
        fileid = self._getfileid(filename, store=store, **kwargs)

        self.cur.execute("""SELECT
            sourcewords, targetwords
            FROM units WHERE fileid=?
            ORDER BY unitindex;""", (fileid,))

        for sourcecount, targetcount in self.cur.fetchall():
            stats["sourcewordcount"].append(sourcecount)
            stats["targetwordcount"].append(targetcount)

        return stats