From 2d8c2167a4bfd2b16bff6a141561d34ff427d07f Mon Sep 17 00:00:00 2001
From: winterstream
Date: Mon, 23 Jun 2008 12:47:14 +0000
Subject: [PATCH] Added some robustness to the statistics code to handle
 corrupt files.

The boolean parameter errors_return_empty can be passed to a number of
the methods in statsdb; if errors_return_empty is True, they will not
raise exceptions but will instead return empty or zero values.

Created a ParseError exception which should be raised by parsers that
run into parsing problems. So far, only pypo.py does this (and in a
somewhat rudimentary way).

git-svn-id: http://translate.svn.sourceforge.net/svnroot/translate/src/trunk/translate@7646 54714841-351b-0410-a198-e36a94b762f5
---
 storage/base.py    |  3 ++
 storage/pypo.py    | 85 +++++++++++++++++++++++++++--------------------
 storage/statsdb.py | 86 +++++++++++++++++++++++++++++++-----------------------
 3 files changed, 96 insertions(+), 78 deletions(-)

diff --git a/storage/base.py b/storage/base.py
index 7870dd4..d6c7e9f 100644
--- a/storage/base.py
+++ b/storage/base.py
@@ -43,6 +43,9 @@ def force_override(method, baseclass):
     if actualclass != baseclass:
         raise NotImplementedError("%s does not reimplement %s as required by %s" % (actualclass.__name__, method.__name__, baseclass.__name__))
 
+class ParseError(Exception):
+    pass
+
 class TranslationUnit(object):
     """Base class for translation units.
 
diff --git a/storage/pypo.py b/storage/pypo.py
index 66b5772..8eba309 100644
--- a/storage/pypo.py
+++ b/storage/pypo.py
@@ -27,7 +27,7 @@ from translate.misc.multistring import multistring
 from translate.misc import quote
 from translate.misc import textwrap
 from translate.lang import data
-from translate.storage import pocommon
+from translate.storage import pocommon, base
 import re
 
 lsep = "\n#: "
@@ -801,47 +801,50 @@ class pofile(pocommon.pofile):
 
     def parse(self, input):
         """parses the given file or file source string"""
-        if hasattr(input, 'name'):
-            self.filename = input.name
-        elif not getattr(self, 'filename', ''):
-            self.filename = ''
-        if hasattr(input, "read"):
-            posrc = input.read()
-            input.close()
-            input = posrc
-        # TODO: change this to a proper parser that doesn't do line-by-line madness
-        lines = input.split("\n")
-        start = 0
-        end = 0
-        # make only the first one the header
-        linesprocessed = 0
-        is_decoded = False
-        while end <= len(lines):
-            if (end == len(lines)) or (not lines[end].strip()): # end of lines or blank line
-                newpe = self.UnitClass(encoding=self._encoding)
-                unit_lines = lines[start:end]
-                # We need to work carefully if we haven't decoded properly yet.
-                # So let's solve this temporarily until we actually get the
-                # encoding from the header.
-                if not is_decoded:
-                    unit_lines = [line.decode('ascii', 'ignore') for line in unit_lines]
-                linesprocessed = newpe.parselines(unit_lines)
-                start += linesprocessed
-                # TODO: find a better way of working out if we actually read anything
-                if linesprocessed >= 1 and newpe._getoutput():
-                    self.units.append(newpe)
-                if not is_decoded:
-                    if newpe.isheader(): # If there is a header...
-                        if "Content-Type" in self.parseheader(): # and a Content-Type...
-                            if self._encoding.lower() != 'charset': # with a valid charset...
-                                self._encoding = newpe._encoding # then change the encoding
-                        # otherwise we'll decode using UTF-8
-                        lines = self.decode(lines)
-                        self.units = []
-                        start = 0
-                        end = 0
-                        is_decoded = True
-            end = end+1
+        try:
+            if hasattr(input, 'name'):
+                self.filename = input.name
+            elif not getattr(self, 'filename', ''):
+                self.filename = ''
+            if hasattr(input, "read"):
+                posrc = input.read()
+                input.close()
+                input = posrc
+            # TODO: change this to a proper parser that doesn't do line-by-line madness
+            lines = input.split("\n")
+            start = 0
+            end = 0
+            # make only the first one the header
+            linesprocessed = 0
+            is_decoded = False
+            while end <= len(lines):
+                if (end == len(lines)) or (not lines[end].strip()): # end of lines or blank line
+                    newpe = self.UnitClass(encoding=self._encoding)
+                    unit_lines = lines[start:end]
+                    # We need to work carefully if we haven't decoded properly yet.
+                    # So let's solve this temporarily until we actually get the
+                    # encoding from the header.
+                    if not is_decoded:
+                        unit_lines = [line.decode('ascii', 'ignore') for line in unit_lines]
+                    linesprocessed = newpe.parselines(unit_lines)
+                    start += linesprocessed
+                    # TODO: find a better way of working out if we actually read anything
+                    if linesprocessed >= 1 and newpe._getoutput():
+                        self.units.append(newpe)
+                    if not is_decoded:
+                        if newpe.isheader(): # If there is a header...
+                            if "Content-Type" in self.parseheader(): # and a Content-Type...
+                                if self._encoding.lower() != 'charset': # with a valid charset...
+                                    self._encoding = newpe._encoding # then change the encoding
+                            # otherwise we'll decode using UTF-8
+                            lines = self.decode(lines)
+                            self.units = []
+                            start = 0
+                            end = 0
+                            is_decoded = True
+                end = end+1
+        except Exception, e:
+            raise base.ParseError()
 
     def removeduplicates(self, duplicatestyle="merge"):
         """make sure each msgid is unique ; merge comments etc from duplicates into original"""
diff --git a/storage/statsdb.py b/storage/statsdb.py
index 6a0d4fd..e325b46 100644
--- a/storage/statsdb.py
+++ b/storage/statsdb.py
@@ -27,7 +27,7 @@
 """
 
 from translate import __version__ as toolkitversion
-from translate.storage import factory
+from translate.storage import factory, base
 from translate.misc.multistring import multistring
 from translate.lang.common import Common
 
@@ -93,16 +93,22 @@ def emptystats():
         stats[state + "targetwords"] = 0
     return stats
 
-def get_mod_info(file_path):
-    file_stat = os.stat(file_path)
-    # First, we multiply the mtime by 1000 to shift any millisecond values
-    # left of the .
-    # Then we make the somewhat daring assumption that 64 bits should be
-    # enough for any translation file size. To make space for the translation
-    # file size bits, we shift the mtime left by 64 bits.
-    return (long(file_stat.st_mtime * 1000) << 64) + file_stat.st_size
+def get_mod_info(file_path, errors_return_empty=False):
+    try:
+        file_stat = os.stat(file_path)
+        # First, we multiply the mtime by 1000 to shift any millisecond values
+        # left of the .
+        # Then we make the somewhat daring assumption that 64 bits should be
+        # enough for any translation file size. To make space for the translation
+        # file size bits, we shift the mtime left by 64 bits.
+        return (long(file_stat.st_mtime * 1000) << 64) + file_stat.st_size
+    except:
+        if errors_return_empty:
+            return 0
+        else:
+            raise
 
-def suggestioninfo(filename):
+def suggestioninfo(filename, **kwargs):
     """Provides the filename of the associated file containing suggestions
     and its mod_info, if it exists."""
     root, ext = os.path.splitext(filename)
@@ -116,7 +122,7 @@
         if not os.path.exists(suggestion_filename):
             suggestion_filename = None
         else:
-            suggestion_mod_info = get_mod_info(suggestion_filename)
+            suggestion_mod_info = get_mod_info(suggestion_filename, **kwargs)
     return suggestion_filename, suggestion_mod_info
 
 class StatsCache(object):
@@ -204,7 +210,7 @@
 
         self.con.commit()
 
-    def _getfileid(self, filename, opt_mod_info=-1, check_mod_info=True, store=None):
+    def _getfileid(self, filename, opt_mod_info=-1, check_mod_info=True, store=None, errors_return_empty=False):
         """Attempt to find the fileid of the given file, if it hasn't been
         updated since the last record update.
 
@@ -220,21 +226,27 @@
         self.cur.execute("""SELECT fileid, mod_info FROM files
                             WHERE path=?;""", (realpath,))
         filerow = self.cur.fetchone()
-        mod_info = max(opt_mod_info, get_mod_info(realpath))
-        if filerow:
-            fileid = filerow[0]
-            if not check_mod_info:
-                # Update the mod_info of the file
-                self.cur.execute("""UPDATE files
-                                    SET mod_info=?
-                                    WHERE fileid=?;""", (str(mod_info), fileid))
-                return fileid
-            if long(filerow[1]) == mod_info:
-                return fileid
-        # We can only ignore the mod_info if the row already exists:
-        assert check_mod_info
-        store = store or factory.getobject(filename)
-        return self.cachestore(store)
+        try:
+            mod_info = max(opt_mod_info, get_mod_info(realpath))
+            if filerow:
+                fileid = filerow[0]
+                if not check_mod_info:
+                    # Update the mod_info of the file
+                    self.cur.execute("""UPDATE files
+                                        SET mod_info=?
+                                        WHERE fileid=?;""", (str(mod_info), fileid))
+                    return fileid
+                if long(filerow[1]) == mod_info:
+                    return fileid
+            # We can only ignore the mod_info if the row already exists:
+            assert check_mod_info
+            store = store or factory.getobject(filename)
+            return self.cachestore(store)
+        except (base.ParseError, IOError, OSError):
+            if errors_return_empty:
+                return -1
+            else:
+                raise
 
     def _getstoredcheckerconfig(self, checker):
         """See if this checker configuration has been used before."""
@@ -303,13 +315,13 @@
             totals = emptystats()
         return self.cur.fetchall()
 
-    def filetotals(self, filename):
+    def filetotals(self, filename, **kwargs):
         """Retrieves the statistics for the given file if possible,
         otherwise delegates to cachestore()."""
         fileid = None
         if not fileid:
             try:
-                fileid = self._getfileid(filename)
+                fileid = self._getfileid(filename, **kwargs)
             except ValueError, e:
                 print >> sys.stderr, str(e)
                 return {}
@@ -404,14 +416,14 @@
             state.extend(self._cacheunitschecks([unit], fileid, configid, checker, unitindex))
         return state
 
-    def filechecks(self, filename, checker, store=None):
+    def filechecks(self, filename, checker, store=None, **kwargs):
         """Retrieves the error statistics for the given file if possible,
         otherwise delegates to cachestorechecks()."""
-        suggestion_filename, suggestion_mod_info = suggestioninfo(filename)
+        suggestion_filename, suggestion_mod_info = suggestioninfo(filename, **kwargs)
         fileid = None
         configid = self._getstoredcheckerconfig(checker)
         try:
-            fileid = self._getfileid(filename, suggestion_mod_info, store=store)
+            fileid = self._getfileid(filename, suggestion_mod_info, store=store, **kwargs)
             if not configid:
                 self.cur.execute("""INSERT INTO checkerconfigs
                                     (configid, config) values (NULL, ?);""",
@@ -450,13 +462,13 @@
 
         return errors
 
-    def filestats(self, filename, checker, store=None):
+    def filestats(self, filename, checker, store=None, **kwargs):
         """Return a dictionary of property names mapping sets of unit
         indices with those properties."""
         stats = {"total": [], "translated": [], "fuzzy": [], "untranslated": []}
 
-        stats.update(self.filechecks(filename, checker, store))
-        fileid = self._getfileid(filename, store=store)
+        stats.update(self.filechecks(filename, checker, store, **kwargs))
+        fileid = self._getfileid(filename, store=store, **kwargs)
 
         self.cur.execute("""SELECT
             state,
@@ -471,7 +483,7 @@
 
         return stats
 
-    def unitstats(self, filename, _lang=None, store=None):
+    def unitstats(self, filename, _lang=None, store=None, **kwargs):
         # For now, lang and store are unused. lang will allow the user to
         # base stats information on the given language. See the commented
         # line containing stats.update below.
@@ -484,7 +496,7 @@
         stats = {"sourcewordcount": [], "targetwordcount": []}
 
         #stats.update(self.unitchecks(filename, lang, store))
-        fileid = self._getfileid(filename, store=store)
+        fileid = self._getfileid(filename, store=store, **kwargs)
 
         self.cur.execute("""SELECT
             sourcewords, targetwords
-- 
2.11.4.GIT
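
Usage sketch (not part of the patch itself): a minimal illustration of the
calling convention the commit message describes, assuming the translate
toolkit layout at this revision (translate.storage.statsdb and
translate.storage.base). The cache path "stats.db" and the file
"example.po" are placeholders, and StatsCache is assumed to accept the
path of its SQLite cache file.

    from translate.storage import statsdb, base

    # Placeholder path; StatsCache is assumed to take the location of
    # its SQLite statistics cache.
    cache = statsdb.StatsCache("stats.db")

    # Default behaviour: a corrupt PO file makes the parser raise
    # base.ParseError, which propagates out of filetotals(), so the
    # caller must handle it.
    try:
        totals = cache.filetotals("example.po")
    except base.ParseError:
        totals = {}

    # With errors_return_empty=True, the keyword argument is forwarded
    # to _getfileid(), which swallows ParseError, IOError and OSError
    # and returns its -1 sentinel, so the caller gets empty or zero
    # statistics back instead of an exception.
    totals = cache.filetotals("example.po", errors_return_empty=True)

The sentinel values mirror each other: on failure get_mod_info() returns 0
and _getfileid() returns -1, so callers that opt in see "no data" for a
corrupt file rather than a crash.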