storage/statsdb.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2007 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """Module to provide a cache of statistics in a database.
  23
  24 @organization: Zuza Software Foundation
  25 @copyright: 2007 Zuza Software Foundation
  26 @license: U{GPL <http://www.fsf.org/licensing/licenses/gpl.html>}
  27 """
  28
  29 from translate import __version__ as toolkitversion
  30 from translate.storage import factory, base
  31 from translate.misc.multistring import multistring
  32 from translate.lang.common import Common
  33
  34 try:
  35     from sqlite3 import dbapi2
  36 except ImportError:
  37     from pysqlite2 import dbapi2
  38 import os.path
  39 import re
  40 import sys
  41 import stat
  42
  43 kdepluralre = re.compile("^_n: ")
  44 brtagre = re.compile("<br\s*?/?>")
  45 xmltagre = re.compile("<[^>]+>")
  46 numberre = re.compile("\\D\\.\\D")
  47
  48 state_strings = {0: "untranslated", 1: "translated", 2: "fuzzy"}
  49
  50 def wordcount(string):
  51     # TODO: po class should understand KDE style plurals
  52     string = kdepluralre.sub("", string)
  53     string = brtagre.sub("\n", string)
  54     string = xmltagre.sub("", string)
  55     string = numberre.sub(" ", string)
  56     #TODO: This should still use the correct language to count in the target
  57     #language
  58     return len(Common.words(string))
  59
  60 def wordsinunit(unit):
  61     """Counts the words in the unit's source and target, taking plurals into
  62     account. The target words are only counted if the unit is translated."""
  63     (sourcewords, targetwords) = (0, 0)
  64     if isinstance(unit.source, multistring):
  65         sourcestrings = unit.source.strings
  66     else:
  67         sourcestrings = [unit.source or ""]
  68     for s in sourcestrings:
  69         sourcewords += wordcount(s)
  70     if not unit.istranslated():
  71         return sourcewords, targetwords
  72     if isinstance(unit.target, multistring):
  73         targetstrings = unit.target.strings
  74     else:
  75         targetstrings = [unit.target or ""]
  76     for s in targetstrings:
  77         targetwords += wordcount(s)
  78     return sourcewords, targetwords
  79
  80 def statefordb(unit):
  81     """Returns the numeric database state for the unit."""
  82     if unit.istranslated():
  83         return 1
  84     if unit.isfuzzy() and unit.target:
  85         return 2
  86     return 0
  87
  88 def emptystats():
  89     """Returns a dictionary with all statistics initalised to 0."""
  90     stats = {}
  91     for state in ["total", "translated", "fuzzy", "untranslated", "review"]:
  92         stats[state] = 0
  93         stats[state + "sourcewords"] = 0
  94         stats[state + "targetwords"] = 0
  95     return stats
  96
# We allow the caller to specify which value to return when errors_return_empty
# is True. We do this, since Pootle wants None to be returned when it calls
# get_mod_info directly, whereas we want an integer to be returned for
# uses of get_mod_info within this module.
# TODO: Get rid of empty_return when Pootle code is improved to not require
#       this.
 103 def get_mod_info(file_path, errors_return_empty=False, empty_return=0):
 104     try:
 105         file_stat = os.stat(file_path)
 106         assert not stat.S_ISDIR(file_stat.st_mode)
 107         return (file_stat.st_mtime, file_stat.st_size)
 108     except:
 109         if errors_return_empty:
 110             return empty_return
 111         else:
 112             raise
 113
 114 def suggestioninfo(filename, **kwargs):
 115     """Provides the filename of the associated file containing suggestions and
 116     its mod_info, if it exists."""
 117     root, ext = os.path.splitext(filename)
 118     suggestion_filename = None
 119     suggestion_mod_info = -1
 120     if ext == os.path.extsep + "po":
 121         # For a PO file there might be an associated file with suggested
 122         # translations. If either file changed, we want to regenerate the
 123         # statistics.
 124         suggestion_filename = filename + os.path.extsep + 'pending'
 125         if not os.path.exists(suggestion_filename):
 126             suggestion_filename = None
 127         else:
 128             suggestion_mod_info = get_mod_info(suggestion_filename, **kwargs)
 129     return suggestion_filename, suggestion_mod_info
 130
 131 def parse_mod_info(string):
 132     try:
 133         tokens = string.strip("()").split(",")
 134         if os.stat_float_times():
 135             return (float(tokens[0]), long(tokens[1]))
 136         else:
 137             return (int(tokens[0]), long(tokens[1]))
 138     except:
 139         return (-1, -1)
 140
 141 def dump_mod_info(mod_info):
 142     return str(mod_info)
 143
 144 class StatsCache(object):
 145     """An object instantiated as a singleton for each statsfile that provides
 146     access to the database cache from a pool of StatsCache objects."""
 147     _caches = {}
 148     defaultfile = None
 149     con = None
 150     """This cache's connection"""
 151     cur = None
 152     """The current cursor"""
 153
 154     def __new__(cls, statsfile=None):
 155         if not statsfile:
 156             if not cls.defaultfile:
 157                 userdir = os.path.expanduser("~")
 158                 cachedir = None
 159                 if os.name == "nt":
 160                     cachedir = os.path.join(userdir, "Translate Toolkit")
 161                 else:
 162                     cachedir = os.path.join(userdir, ".translate_toolkit")
 163                 if not os.path.exists(cachedir):
 164                     os.mkdir(cachedir)
 165                 cls.defaultfile = os.path.realpath(os.path.join(cachedir, "stats.db"))
 166             statsfile = cls.defaultfile
 167         else:
 168             statsfile = os.path.realpath(statsfile)
 169         # First see if a cache for this file already exists:
 170         if statsfile in cls._caches:
 171             return cls._caches[statsfile]
 172         # No existing cache. Let's build a new one and keep a copy
 173         cache = cls._caches[statsfile] = object.__new__(cls)
 174         cache.con = dbapi2.connect(statsfile)
 175         cache.cur = cache.con.cursor()
 176         cache.create()
 177         return cache
 178
 179     def create(self):
 180         """Create all tables and indexes."""
 181         self.cur.execute("""CREATE TABLE IF NOT EXISTS files(
 182             fileid INTEGER PRIMARY KEY AUTOINCREMENT,
 183             path VARCHAR NOT NULL UNIQUE,
 184             mod_info CHAR(50) NOT NULL,
 185             toolkitbuild INTEGER NOT NULL);""")
 186         # mod_info should never be larger than about 138 bits as computed by
 187         # get_mod_info. This is because st_mtime is at most 64 bits, multiplying
 188         # by 1000 adds at most 10 bits and file_stat.st_size is at most 64 bits.
 189         # Therefore, we should get away with 50 decimal digits (actually, we need
 190         # math.log((1 << 139) - 1, 10) = 41.8 characters, but whatever).
 191
 192         self.cur.execute("""CREATE UNIQUE INDEX IF NOT EXISTS filepathindex
 193             ON files (path);""")
 194
 195         self.cur.execute("""CREATE TABLE IF NOT EXISTS units(
 196             id INTEGER PRIMARY KEY AUTOINCREMENT,
 197             unitid VARCHAR NOT NULL,
 198             fileid INTEGER NOT NULL,
 199             unitindex INTEGER NOT NULL,
 200             source VARCHAR NOT NULL,
 201             target VARCHAR,
 202             state INTEGER,
 203             sourcewords INTEGER,
 204             targetwords INTEGER);""")
 205
 206         self.cur.execute("""CREATE INDEX IF NOT EXISTS fileidindex
 207             ON units(fileid);""")
 208
 209         self.cur.execute("""CREATE TABLE IF NOT EXISTS checkerconfigs(
 210             configid INTEGER PRIMARY KEY AUTOINCREMENT,
 211             config VARCHAR);""")
 212
 213         self.cur.execute("""CREATE INDEX IF NOT EXISTS configindex
 214             ON checkerconfigs(config);""")
 215
 216         self.cur.execute("""CREATE TABLE IF NOT EXISTS uniterrors(
 217             errorid INTEGER PRIMARY KEY AUTOINCREMENT,
 218             unitindex INTEGER NOT NULL,
 219             fileid INTEGER NOT NULL,
 220             configid INTEGER NOT NULL,
 221             name VARCHAR NOT NULL,
 222             message VARCHAR);""")
 223
 224         self.cur.execute("""CREATE INDEX IF NOT EXISTS uniterrorindex
 225             ON uniterrors(fileid, configid);""")
 226
 227         self.con.commit()
 228
 229     def _getfileid(self, filename, opt_mod_info=(-1, -1), check_mod_info=True, store=None, errors_return_empty=False):
 230         """Attempt to find the fileid of the given file, if it hasn't been
 231         updated since the last record update.
 232
 233         None is returned if either the file's record is not found, or if it is
 234         not up to date.
 235
 236         @param filename: the filename to retrieve the id for
 237         @param opt_mod_info: an optional mod_info to consider in addition
 238         to the actual mod_info of the given file
 239         @rtype: String or None
 240         """
 241         realpath = os.path.realpath(filename)
 242         self.cur.execute("""SELECT fileid, mod_info FROM files
 243                 WHERE path=?;""", (realpath,))
 244         filerow = self.cur.fetchone()
 245         try:
 246             mod_info = max(opt_mod_info, get_mod_info(realpath))
 247             if filerow:
 248                 fileid = filerow[0]
 249                 if not check_mod_info:
 250                     # Update the mod_info of the file
 251                     self.cur.execute("""UPDATE files
 252                             SET mod_info=?
 253                             WHERE fileid=?;""", (dump_mod_info(mod_info), fileid))
 254                     return fileid
 255                 if parse_mod_info(filerow[1]) == mod_info:
 256                     return fileid
 257             # We can only ignore the mod_info if the row already exists:
 258             assert check_mod_info
 259             store = store or factory.getobject(filename)
 260             return self._cachestore(store, mod_info)
 261         except (base.ParseError, IOError, OSError, AssertionError):
 262             if errors_return_empty:
 263                 return -1
 264             else:
 265                 raise
 266
 267     def _getstoredcheckerconfig(self, checker):
 268         """See if this checker configuration has been used before."""
 269         config = str(checker.config.__dict__)
 270         self.cur.execute("""SELECT configid, config FROM checkerconfigs WHERE
 271             config=?;""", (config,))
 272         configrow = self.cur.fetchone()
 273         if not configrow or configrow[1] != config:
 274             return None
 275         else:
 276             return configrow[0]
 277
 278     def _cacheunitstats(self, units, fileid, unitindex=None):
 279         """Cache the statistics for the supplied unit(s)."""
 280         unitvalues = []
 281         for index, unit in enumerate(units):
 282             if unit.istranslatable():
 283                 sourcewords, targetwords = wordsinunit(unit)
 284                 if unitindex:
 285                     index = unitindex
 286                 # what about plurals in .source and .target?
 287                 unitvalues.append((unit.getid(), fileid, index, \
 288                                 unit.source, unit.target, \
 289                                 sourcewords, targetwords, \
 290                                 statefordb(unit)))
 291         # XXX: executemany is non-standard
 292         self.cur.executemany("""INSERT INTO units
 293             (unitid, fileid, unitindex, source, target, sourcewords, targetwords, state)
 294             values (?, ?, ?, ?, ?, ?, ?, ?);""",
 295             unitvalues)
 296         self.con.commit()
 297         if unitindex:
 298             return state_strings[statefordb(units[0])]
 299         return ""
 300
 301     def _cachestore(self, store, mod_info):
 302         """Calculates and caches the statistics of the given store
 303         unconditionally."""
 304         realpath = os.path.realpath(store.filename)
 305         os.utime(realpath, (mod_info[0], mod_info[0]))
 306         self.cur.execute("""DELETE FROM files WHERE
 307             path=?;""", (realpath,))
 308         self.cur.execute("""INSERT INTO files
 309             (fileid, path, mod_info, toolkitbuild) values (NULL, ?, ?, ?);""",
 310             (realpath, dump_mod_info(mod_info), toolkitversion.build))
 311         fileid = self.cur.lastrowid
 312         self.cur.execute("""DELETE FROM units WHERE
 313             fileid=?""", (fileid,))
 314         self._cacheunitstats(store.units, fileid)
 315         return fileid
 316
 317     def directorytotals(self, dirname):
 318         """Retrieves the stored statistics for a given directory, all summed.
 319
 320         Note that this does not check for mod_infos or the presence of files."""
 321         realpath = os.path.realpath(dirname)
 322         self.cur.execute("""SELECT
 323             state,
 324             count(unitid) as total,
 325             sum(sourcewords) as sourcewords,
 326             sum(targetwords) as targetwords
 327             FROM units WHERE fileid IN
 328                 (SELECT fileid from files
 329                 WHERE substr(path, 0, ?)=?)
 330             GROUP BY state;""", (len(realpath), realpath))
 331         totals = emptystats()
 332         return self.cur.fetchall()
 333
 334     def filetotals(self, filename, **kwargs):
 335         """Retrieves the statistics for the given file if possible, otherwise
 336         delegates to cachestore()."""
 337         fileid = None
 338         if not fileid:
 339             try:
 340                 fileid = self._getfileid(filename, **kwargs)
 341             except ValueError, e:
 342                 print >> sys.stderr, str(e)
 343                 return {}
 344
 345         self.cur.execute("""SELECT
 346             state,
 347             count(unitid) as total,
 348             sum(sourcewords) as sourcewords,
 349             sum(targetwords) as targetwords
 350             FROM units WHERE fileid=?
 351             GROUP BY state;""", (fileid,))
 352         values = self.cur.fetchall()
 353
 354         totals = emptystats()
 355         for stateset in values:
 356             state = state_strings[stateset[0]]          # state
 357             totals[state] = stateset[1] or 0            # total
 358             totals[state + "sourcewords"] = stateset[2] # sourcewords
 359             totals[state + "targetwords"] = stateset[3] # targetwords
 360         totals["total"] = totals["untranslated"] + totals["translated"] + totals["fuzzy"]
 361         totals["totalsourcewords"] = totals["untranslatedsourcewords"] + \
 362                 totals["translatedsourcewords"] + \
 363                 totals["fuzzysourcewords"]
 364         return totals
 365
 366     def _cacheunitschecks(self, units, fileid, configid, checker, unitindex=None):
 367         """Helper method for cachestorechecks() and recacheunit()"""
 368         # We always want to store one dummy error to know that we have actually
 369         # run the checks on this file with the current checker configuration
 370         dummy = (-1, fileid, configid, "noerror", "")
 371         unitvalues = [dummy]
 372         # if we are doing a single unit, we want to return the checknames
 373         errornames = []
 374         for index, unit in enumerate(units):
 375             if unit.istranslatable():
 376                 # Correctly assign the unitindex
 377                 if unitindex:
 378                     index = unitindex
 379                 failures = checker.run_filters(unit)
 380                 for checkname, checkmessage in failures.iteritems():
 381                     unitvalues.append((index, fileid, configid, checkname, checkmessage))
 382                     errornames.append("check-" + checkname)
 383         checker.setsuggestionstore(None)
 384
 385         if unitindex:
 386             # We are only updating a single unit, so we don't want to add an
 387             # extra noerror-entry
 388             unitvalues.remove(dummy)
 389             errornames.append("total")
 390
 391         # XXX: executemany is non-standard
 392         self.cur.executemany("""INSERT INTO uniterrors
 393             (unitindex, fileid, configid, name, message)
 394             values (?, ?, ?, ?, ?);""",
 395             unitvalues)
 396         self.con.commit()
 397         return errornames
 398
 399     def cachestorechecks(self, fileid, store, checker, configid):
 400         """Calculates and caches the error statistics of the given store
 401         unconditionally."""
 402         # Let's purge all previous failures because they will probably just
 403         # fill up the database without much use.
 404         self.cur.execute("""DELETE FROM uniterrors WHERE
 405             fileid=?;""", (fileid,))
 406         self._cacheunitschecks(store.units, fileid, configid, checker)
 407         return fileid
 408
 409     def recacheunit(self, filename, checker, unit):
 410         """Recalculate all information for a specific unit. This is necessary
 411         for updating all statistics when a translation of a unit took place,
 412         for example.
 413
 414         This method assumes that everything was up to date before (file totals,
 415         checks, checker config, etc."""
 416         suggestion_filename, suggestion_mod_info = suggestioninfo(filename)
 417         fileid = self._getfileid(filename, suggestion_mod_info, check_mod_info=False)
 418         configid = self._getstoredcheckerconfig(checker)
 419         unitid = unit.getid()
 420         # get the unit index
 421         self.cur.execute("""SELECT unitindex FROM units WHERE
 422             fileid=? AND unitid=?;""", (fileid, unitid))
 423         unitindex = self.cur.fetchone()[0]
 424         self.cur.execute("""DELETE FROM units WHERE
 425             fileid=? AND unitid=?;""", (fileid, unitid))
 426         state = [self._cacheunitstats([unit], fileid, unitindex)]
 427         # remove the current errors
 428         self.cur.execute("""DELETE FROM uniterrors WHERE
 429             fileid=? AND unitindex=?;""", (fileid, unitindex))
 430         if suggestion_filename:
 431             checker.setsuggestionstore(factory.getobject(suggestion_filename, ignore=os.path.extsep+ 'pending'))
 432         state.extend(self._cacheunitschecks([unit], fileid, configid, checker, unitindex))
 433         return state
 434
 435     def filechecks(self, filename, checker, store=None, **kwargs):
 436         """Retrieves the error statistics for the given file if possible,
 437         otherwise delegates to cachestorechecks()."""
 438         suggestion_filename, suggestion_mod_info = suggestioninfo(filename, **kwargs)
 439         fileid = None
 440         configid = self._getstoredcheckerconfig(checker)
 441         try:
 442             fileid = self._getfileid(filename, suggestion_mod_info, store=store, **kwargs)
 443             if not configid:
 444                 self.cur.execute("""INSERT INTO checkerconfigs
 445                     (configid, config) values (NULL, ?);""",
 446                     (str(checker.config.__dict__),))
 447                 configid = self.cur.lastrowid
 448         except ValueError, e:
 449             print >> sys.stderr, str(e)
 450             return {}
 451
 452         def geterrors():
 453             self.cur.execute("""SELECT
 454                 name,
 455                 unitindex
 456                 FROM uniterrors WHERE fileid=? and configid=?
 457                 ORDER BY unitindex;""", (fileid, configid))
 458             return self.cur.fetchall()
 459
 460         values = geterrors()
 461         if not values:
 462             # This could happen if we haven't done the checks before, or the
 463             # file changed, or we are using a different configuration
 464             store = store or factory.getobject(filename)
 465             if suggestion_filename:
 466                 checker.setsuggestionstore(factory.getobject(suggestion_filename, ignore=os.path.extsep+ 'pending'))
 467             self.cachestorechecks(fileid, store, checker, configid)
 468             values = geterrors()
 469
 470         errors = {}
 471         for value in values:
 472             if value[1] == -1:
 473                 continue
 474             checkkey = 'check-' + value[0]      #value[0] is the error name
 475             if not checkkey in errors:
 476                 errors[checkkey] = []
 477             errors[checkkey].append(value[1])   #value[1] is the unitindex
 478
 479         return errors
 480
 481     def filestats(self, filename, checker, store=None, **kwargs):
 482         """Return a dictionary of property names mapping sets of unit
 483         indices with those properties."""
 484         stats = {"total": [], "translated": [], "fuzzy": [], "untranslated": []}
 485
 486         stats.update(self.filechecks(filename, checker, store, **kwargs))
 487         fileid = self._getfileid(filename, store=store, **kwargs)
 488
 489         self.cur.execute("""SELECT
 490             state,
 491             unitindex
 492             FROM units WHERE fileid=?
 493             ORDER BY unitindex;""", (fileid,))
 494
 495         values = self.cur.fetchall()
 496         for value in values:
 497             stats[state_strings[value[0]]].append(value[1])
 498             stats["total"].append(value[1])
 499
 500         return stats
 501
 502     def unitstats(self, filename, _lang=None, store=None, **kwargs):
 503         # For now, lang and store are unused. lang will allow the user to
 504         # base stats information on the given language. See the commented
 505         # line containing stats.update below.
 506         """Return a dictionary of property names mapping to arrays which
 507         map unit indices to property values.
 508
 509         Please note that this is different from filestats, since filestats
 510         supplies sets of unit indices with a given property, whereas this
 511         method supplies arrays which map unit indices to given values."""
 512         stats = {"sourcewordcount": [], "targetwordcount": []}
 513
 514         #stats.update(self.unitchecks(filename, lang, store))
 515         fileid = self._getfileid(filename, store=store, **kwargs)
 516
 517         self.cur.execute("""SELECT
 518           sourcewords, targetwords
 519           FROM units WHERE fileid=?
 520           ORDER BY unitindex;""", (fileid,))
 521
 522         for sourcecount, targetcount in self.cur.fetchall():
 523             stats["sourcewordcount"].append(sourcecount)
 524             stats["targetwordcount"].append(targetcount)
 525
 526         return stats