2 # -*- coding: utf-8 -*-
4 # Copyright 2007 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """Module to provide a cache of statistics in a database.
24 @organization: Zuza Software Foundation
25 @copyright: 2007 Zuza Software Foundation
26 @license: U{GPL <http://www.fsf.org/licensing/licenses/gpl.html>}
29 from translate
import __version__
as toolkitversion
30 from translate
.storage
import factory
, base
31 from translate
.misc
.multistring
import multistring
32 from translate
.lang
.common
import Common
35 from sqlite3
import dbapi2
37 from pysqlite2
import dbapi2
# Strips the "_n: " marker found at the start of KDE style plural messages.
kdepluralre = re.compile(r"^_n: ")
# Matches <br>, <br/> and <br /> tags.
brtagre = re.compile(r"<br\s*?/?>")
# Matches any single XML/HTML tag.
xmltagre = re.compile(r"<[^>]+>")
# Matches a full stop together with the non-digit character on either side.
numberre = re.compile(r"\D\.\D")

# Maps the numeric unit state stored in the database to its name.
state_strings = {0: "untranslated", 1: "translated", 2: "fuzzy"}
def wordcount(string):
    """Returns the number of words in the given string.

    Before counting, a leading KDE plural marker and all XML tags are
    removed, <br> tags become newlines, and every match of numberre is
    replaced by a space. The words are then counted with Common.words().

    @param string: the text to count the words of
    @return: the number of words found
    """
    # TODO: po class should understand KDE style plurals
    cleanups = (
        (kdepluralre, ""),
        (brtagre, "\n"),
        (xmltagre, ""),
        (numberre, " "),
    )
    text = string
    # The order matters: <br> must become a newline before the generic tag
    # pattern gets a chance to delete it.
    for pattern, replacement in cleanups:
        text = pattern.sub(replacement, text)
    #TODO: This should still use the correct language to count in the target
    # language
    return len(Common.words(text))
def wordsinunit(unit):
    """Counts the words in the unit's source and target, taking plurals into
    account. The target words are only counted if the unit is translated.

    @param unit: the unit to count; its source and target may be plain
    strings, multistrings or empty
    @return: a tuple (sourcewords, targetwords)
    """
    def count(text):
        # A multistring carries one string per plural form; count them all.
        if isinstance(text, multistring):
            strings = text.strings
        else:
            strings = [text or ""]
        return sum([wordcount(s) for s in strings])

    sourcewords = count(unit.source)
    if not unit.istranslated():
        return sourcewords, 0
    return sourcewords, count(unit.target)
def statefordb(unit):
    """Returns the numeric database state for the unit.

    The numbers are the keys of state_strings: 1 for a translated unit,
    2 for a fuzzy unit that has a target, and 0 (untranslated) otherwise.

    @param unit: the unit to classify
    @return: 0, 1 or 2
    """
    # NOTE(review): the return values were reconstructed from the
    # state_strings mapping -- confirm against the database contents.
    if unit.istranslated():
        return 1
    if unit.isfuzzy() and unit.target:
        return 2
    return 0
def emptystats():
    """Returns a dictionary with all statistics initialised to 0.

    For each of "total", "translated", "fuzzy", "untranslated" and "review"
    there are three keys: the name itself (unit count), the name followed by
    "sourcewords" and the name followed by "targetwords".

    @return: a new dictionary on every call
    """
    stats = {}
    for state in ["total", "translated", "fuzzy", "untranslated", "review"]:
        stats[state] = 0
        stats[state + "sourcewords"] = 0
        stats[state + "targetwords"] = 0
    return stats
# We allow the caller to specify which value to return when
# errors_return_empty is True. We do this since Pootle wants None to be
# returned when it calls get_mod_info directly, whereas we want an integer
# to be returned for uses of get_mod_info within this module.
# TODO: Get rid of empty_return when the Pootle code is improved to not
# require it.
def get_mod_info(file_path, errors_return_empty=False, empty_return=0):
    """Returns the modification info of the given file.

    @param file_path: the path of the file to inspect
    @param errors_return_empty: if True, return empty_return instead of
    raising when the file cannot be inspected or is a directory
    @param empty_return: the value returned on errors when
    errors_return_empty is True
    @return: a tuple (st_mtime, st_size) taken from os.stat()
    @raise OSError: if os.stat() fails and errors_return_empty is False
    @raise AssertionError: if file_path is a directory and
    errors_return_empty is False
    """
    try:
        file_stat = os.stat(file_path)
        if stat.S_ISDIR(file_stat.st_mode):
            # Raised explicitly instead of with an assert statement, so that
            # the check still happens when Python runs with -O. The exception
            # type is kept for callers that catch AssertionError.
            raise AssertionError("%s is a directory" % file_path)
        return (file_stat.st_mtime, file_stat.st_size)
    except Exception:
        if errors_return_empty:
            return empty_return
        raise
def suggestioninfo(filename, **kwargs):
    """Provides the filename of the associated file containing suggestions and
    its mod_info, if it exists.

    @param filename: the name of the translation file
    @param kwargs: passed on unchanged to get_mod_info()
    @return: a tuple (suggestion_filename, suggestion_mod_info); this is
    (None, -1) when filename is not a PO file or has no suggestion file
    """
    ext = os.path.splitext(filename)[1]
    if ext != os.path.extsep + "po":
        return None, -1
    # For a PO file there might be an associated file with suggested
    # translations. If either file changed, we want to regenerate the
    # statistics.
    suggestion_filename = filename + os.path.extsep + 'pending'
    if not os.path.exists(suggestion_filename):
        return None, -1
    return suggestion_filename, get_mod_info(suggestion_filename, **kwargs)
131 def parse_mod_info(string
):
133 tokens
= string
.strip("()").split(",")
134 if os
.stat_float_times():
135 return (float(tokens
[0]), long(tokens
[1]))
137 return (int(tokens
[0]), long(tokens
[1]))
141 def dump_mod_info(mod_info
):
144 class StatsCache(object):
145 """An object instantiated as a singleton for each statsfile that provides
146 access to the database cache from a pool of StatsCache objects."""
150 """This cache's connection"""
152 """The current cursor"""
154 def __new__(cls
, statsfile
=None):
156 if not cls
.defaultfile
:
157 userdir
= os
.path
.expanduser("~")
160 cachedir
= os
.path
.join(userdir
, "Translate Toolkit")
162 cachedir
= os
.path
.join(userdir
, ".translate_toolkit")
163 if not os
.path
.exists(cachedir
):
165 cls
.defaultfile
= os
.path
.realpath(os
.path
.join(cachedir
, "stats.db"))
166 statsfile
= cls
.defaultfile
168 statsfile
= os
.path
.realpath(statsfile
)
169 # First see if a cache for this file already exists:
170 if statsfile
in cls
._caches
:
171 return cls
._caches
[statsfile
]
172 # No existing cache. Let's build a new one and keep a copy
173 cache
= cls
._caches
[statsfile
] = object.__new
__(cls
)
174 cache
.con
= dbapi2
.connect(statsfile
)
175 cache
.cur
= cache
.con
.cursor()
180 """Create all tables and indexes."""
181 self
.cur
.execute("""CREATE TABLE IF NOT EXISTS files(
182 fileid INTEGER PRIMARY KEY AUTOINCREMENT,
183 path VARCHAR NOT NULL UNIQUE,
184 mod_info CHAR(50) NOT NULL,
185 toolkitbuild INTEGER NOT NULL);""")
186 # mod_info should never be larger than about 138 bits as computed by
187 # get_mod_info. This is because st_mtime is at most 64 bits, multiplying
188 # by 1000 adds at most 10 bits and file_stat.st_size is at most 64 bits.
189 # Therefore, we should get away with 50 decimal digits (actually, we need
190 # math.log((1 << 139) - 1, 10) = 41.8 characters, but whatever).
192 self
.cur
.execute("""CREATE UNIQUE INDEX IF NOT EXISTS filepathindex
195 self
.cur
.execute("""CREATE TABLE IF NOT EXISTS units(
196 id INTEGER PRIMARY KEY AUTOINCREMENT,
197 unitid VARCHAR NOT NULL,
198 fileid INTEGER NOT NULL,
199 unitindex INTEGER NOT NULL,
200 source VARCHAR NOT NULL,
204 targetwords INTEGER);""")
206 self
.cur
.execute("""CREATE INDEX IF NOT EXISTS fileidindex
207 ON units(fileid);""")
209 self
.cur
.execute("""CREATE TABLE IF NOT EXISTS checkerconfigs(
210 configid INTEGER PRIMARY KEY AUTOINCREMENT,
213 self
.cur
.execute("""CREATE INDEX IF NOT EXISTS configindex
214 ON checkerconfigs(config);""")
216 self
.cur
.execute("""CREATE TABLE IF NOT EXISTS uniterrors(
217 errorid INTEGER PRIMARY KEY AUTOINCREMENT,
218 unitindex INTEGER NOT NULL,
219 fileid INTEGER NOT NULL,
220 configid INTEGER NOT NULL,
221 name VARCHAR NOT NULL,
222 message VARCHAR);""")
224 self
.cur
.execute("""CREATE INDEX IF NOT EXISTS uniterrorindex
225 ON uniterrors(fileid, configid);""")
229 def _getfileid(self
, filename
, opt_mod_info
=(-1, -1), check_mod_info
=True, store
=None, errors_return_empty
=False):
230 """Attempt to find the fileid of the given file, if it hasn't been
231 updated since the last record update.
233 None is returned if either the file's record is not found, or if it is
236 @param filename: the filename to retrieve the id for
237 @param opt_mod_info: an optional mod_info to consider in addition
238 to the actual mod_info of the given file
239 @rtype: String or None
241 realpath
= os
.path
.realpath(filename
)
242 self
.cur
.execute("""SELECT fileid, mod_info FROM files
243 WHERE path=?;""", (realpath
,))
244 filerow
= self
.cur
.fetchone()
246 mod_info
= max(opt_mod_info
, get_mod_info(realpath
))
249 if not check_mod_info
:
250 # Update the mod_info of the file
251 self
.cur
.execute("""UPDATE files
253 WHERE fileid=?;""", (dump_mod_info(mod_info
), fileid
))
255 if parse_mod_info(filerow
[1]) == mod_info
:
257 # We can only ignore the mod_info if the row already exists:
258 assert check_mod_info
259 store
= store
or factory
.getobject(filename
)
260 return self
._cachestore
(store
, mod_info
)
261 except (base
.ParseError
, IOError, OSError, AssertionError):
262 if errors_return_empty
:
267 def _getstoredcheckerconfig(self
, checker
):
268 """See if this checker configuration has been used before."""
269 config
= str(checker
.config
.__dict
__)
270 self
.cur
.execute("""SELECT configid, config FROM checkerconfigs WHERE
271 config=?;""", (config
,))
272 configrow
= self
.cur
.fetchone()
273 if not configrow
or configrow
[1] != config
:
278 def _cacheunitstats(self
, units
, fileid
, unitindex
=None):
279 """Cache the statistics for the supplied unit(s)."""
281 for index
, unit
in enumerate(units
):
282 if unit
.istranslatable():
283 sourcewords
, targetwords
= wordsinunit(unit
)
286 # what about plurals in .source and .target?
287 unitvalues
.append((unit
.getid(), fileid
, index
, \
288 unit
.source
, unit
.target
, \
289 sourcewords
, targetwords
, \
291 # XXX: executemany is non-standard
292 self
.cur
.executemany("""INSERT INTO units
293 (unitid, fileid, unitindex, source, target, sourcewords, targetwords, state)
294 values (?, ?, ?, ?, ?, ?, ?, ?);""",
298 return state_strings
[statefordb(units
[0])]
301 def _cachestore(self
, store
, mod_info
):
302 """Calculates and caches the statistics of the given store
304 realpath
= os
.path
.realpath(store
.filename
)
305 os
.utime(realpath
, (mod_info
[0], mod_info
[0]))
306 self
.cur
.execute("""DELETE FROM files WHERE
307 path=?;""", (realpath
,))
308 self
.cur
.execute("""INSERT INTO files
309 (fileid, path, mod_info, toolkitbuild) values (NULL, ?, ?, ?);""",
310 (realpath
, dump_mod_info(mod_info
), toolkitversion
.build
))
311 fileid
= self
.cur
.lastrowid
312 self
.cur
.execute("""DELETE FROM units WHERE
313 fileid=?""", (fileid
,))
314 self
._cacheunitstats
(store
.units
, fileid
)
317 def directorytotals(self
, dirname
):
318 """Retrieves the stored statistics for a given directory, all summed.
320 Note that this does not check for mod_infos or the presence of files."""
321 realpath
= os
.path
.realpath(dirname
)
322 self
.cur
.execute("""SELECT
324 count(unitid) as total,
325 sum(sourcewords) as sourcewords,
326 sum(targetwords) as targetwords
327 FROM units WHERE fileid IN
328 (SELECT fileid from files
329 WHERE substr(path, 0, ?)=?)
330 GROUP BY state;""", (len(realpath
), realpath
))
331 totals
= emptystats()
332 return self
.cur
.fetchall()
334 def filetotals(self
, filename
, **kwargs
):
335 """Retrieves the statistics for the given file if possible, otherwise
336 delegates to cachestore()."""
340 fileid
= self
._getfileid
(filename
, **kwargs
)
341 except ValueError, e
:
342 print >> sys
.stderr
, str(e
)
345 self
.cur
.execute("""SELECT
347 count(unitid) as total,
348 sum(sourcewords) as sourcewords,
349 sum(targetwords) as targetwords
350 FROM units WHERE fileid=?
351 GROUP BY state;""", (fileid
,))
352 values
= self
.cur
.fetchall()
354 totals
= emptystats()
355 for stateset
in values
:
356 state
= state_strings
[stateset
[0]] # state
357 totals
[state
] = stateset
[1] or 0 # total
358 totals
[state
+ "sourcewords"] = stateset
[2] # sourcewords
359 totals
[state
+ "targetwords"] = stateset
[3] # targetwords
360 totals
["total"] = totals
["untranslated"] + totals
["translated"] + totals
["fuzzy"]
361 totals
["totalsourcewords"] = totals
["untranslatedsourcewords"] + \
362 totals
["translatedsourcewords"] + \
363 totals
["fuzzysourcewords"]
366 def _cacheunitschecks(self
, units
, fileid
, configid
, checker
, unitindex
=None):
367 """Helper method for cachestorechecks() and recacheunit()"""
368 # We always want to store one dummy error to know that we have actually
369 # run the checks on this file with the current checker configuration
370 dummy
= (-1, fileid
, configid
, "noerror", "")
372 # if we are doing a single unit, we want to return the checknames
374 for index
, unit
in enumerate(units
):
375 if unit
.istranslatable():
376 # Correctly assign the unitindex
379 failures
= checker
.run_filters(unit
)
380 for checkname
, checkmessage
in failures
.iteritems():
381 unitvalues
.append((index
, fileid
, configid
, checkname
, checkmessage
))
382 errornames
.append("check-" + checkname
)
383 checker
.setsuggestionstore(None)
386 # We are only updating a single unit, so we don't want to add an
387 # extra noerror-entry
388 unitvalues
.remove(dummy
)
389 errornames
.append("total")
391 # XXX: executemany is non-standard
392 self
.cur
.executemany("""INSERT INTO uniterrors
393 (unitindex, fileid, configid, name, message)
394 values (?, ?, ?, ?, ?);""",
399 def cachestorechecks(self
, fileid
, store
, checker
, configid
):
400 """Calculates and caches the error statistics of the given store
402 # Let's purge all previous failures because they will probably just
403 # fill up the database without much use.
404 self
.cur
.execute("""DELETE FROM uniterrors WHERE
405 fileid=?;""", (fileid
,))
406 self
._cacheunitschecks
(store
.units
, fileid
, configid
, checker
)
409 def recacheunit(self
, filename
, checker
, unit
):
410 """Recalculate all information for a specific unit. This is necessary
411 for updating all statistics when a translation of a unit took place,
414 This method assumes that everything was up to date before (file totals,
415 checks, checker config, etc."""
416 suggestion_filename
, suggestion_mod_info
= suggestioninfo(filename
)
417 fileid
= self
._getfileid
(filename
, suggestion_mod_info
, check_mod_info
=False)
418 configid
= self
._getstoredcheckerconfig
(checker
)
419 unitid
= unit
.getid()
421 self
.cur
.execute("""SELECT unitindex FROM units WHERE
422 fileid=? AND unitid=?;""", (fileid
, unitid
))
423 unitindex
= self
.cur
.fetchone()[0]
424 self
.cur
.execute("""DELETE FROM units WHERE
425 fileid=? AND unitid=?;""", (fileid
, unitid
))
426 state
= [self
._cacheunitstats
([unit
], fileid
, unitindex
)]
427 # remove the current errors
428 self
.cur
.execute("""DELETE FROM uniterrors WHERE
429 fileid=? AND unitindex=?;""", (fileid
, unitindex
))
430 if suggestion_filename
:
431 checker
.setsuggestionstore(factory
.getobject(suggestion_filename
, ignore
=os
.path
.extsep
+ 'pending'))
432 state
.extend(self
._cacheunitschecks
([unit
], fileid
, configid
, checker
, unitindex
))
435 def filechecks(self
, filename
, checker
, store
=None, **kwargs
):
436 """Retrieves the error statistics for the given file if possible,
437 otherwise delegates to cachestorechecks()."""
438 suggestion_filename
, suggestion_mod_info
= suggestioninfo(filename
, **kwargs
)
440 configid
= self
._getstoredcheckerconfig
(checker
)
442 fileid
= self
._getfileid
(filename
, suggestion_mod_info
, store
=store
, **kwargs
)
444 self
.cur
.execute("""INSERT INTO checkerconfigs
445 (configid, config) values (NULL, ?);""",
446 (str(checker
.config
.__dict
__),))
447 configid
= self
.cur
.lastrowid
448 except ValueError, e
:
449 print >> sys
.stderr
, str(e
)
453 self
.cur
.execute("""SELECT
456 FROM uniterrors WHERE fileid=? and configid=?
457 ORDER BY unitindex;""", (fileid
, configid
))
458 return self
.cur
.fetchall()
462 # This could happen if we haven't done the checks before, or the
463 # file changed, or we are using a different configuration
464 store
= store
or factory
.getobject(filename
)
465 if suggestion_filename
:
466 checker
.setsuggestionstore(factory
.getobject(suggestion_filename
, ignore
=os
.path
.extsep
+ 'pending'))
467 self
.cachestorechecks(fileid
, store
, checker
, configid
)
474 checkkey
= 'check-' + value
[0] #value[0] is the error name
475 if not checkkey
in errors
:
476 errors
[checkkey
] = []
477 errors
[checkkey
].append(value
[1]) #value[1] is the unitindex
481 def filestats(self
, filename
, checker
, store
=None, **kwargs
):
482 """Return a dictionary of property names mapping sets of unit
483 indices with those properties."""
484 stats
= {"total": [], "translated": [], "fuzzy": [], "untranslated": []}
486 stats
.update(self
.filechecks(filename
, checker
, store
, **kwargs
))
487 fileid
= self
._getfileid
(filename
, store
=store
, **kwargs
)
489 self
.cur
.execute("""SELECT
492 FROM units WHERE fileid=?
493 ORDER BY unitindex;""", (fileid
,))
495 values
= self
.cur
.fetchall()
497 stats
[state_strings
[value
[0]]].append(value
[1])
498 stats
["total"].append(value
[1])
502 def unitstats(self
, filename
, _lang
=None, store
=None, **kwargs
):
503 # For now, lang and store are unused. lang will allow the user to
504 # base stats information on the given language. See the commented
505 # line containing stats.update below.
506 """Return a dictionary of property names mapping to arrays which
507 map unit indices to property values.
509 Please note that this is different from filestats, since filestats
510 supplies sets of unit indices with a given property, whereas this
511 method supplies arrays which map unit indices to given values."""
512 stats
= {"sourcewordcount": [], "targetwordcount": []}
514 #stats.update(self.unitchecks(filename, lang, store))
515 fileid
= self
._getfileid
(filename
, store
=store
, **kwargs
)
517 self
.cur
.execute("""SELECT
518 sourcewords, targetwords
519 FROM units WHERE fileid=?
520 ORDER BY unitindex;""", (fileid
,))
522 for sourcecount
, targetcount
in self
.cur
.fetchall():
523 stats
["sourcewordcount"].append(sourcecount
)
524 stats
["targetwordcount"].append(targetcount
)