From 2d8c2167a4bfd2b16bff6a141561d34ff427d07f Mon Sep 17 00:00:00 2001
From: winterstream
Date: Mon, 23 Jun 2008 12:47:14 +0000
Subject: [PATCH] Added some robustness to the statistics code to handle
 corrupt files.

The boolean parameter errors_return_empty can be passed to a number of
the methods in statsdb; if errors_return_empty is True, they will not
raise exceptions but will instead return empty or zero values.

Created a ParseError exception which should be raised by parsers that
run into parsing problems. So far, only pypo.py does this (and in a
somewhat rudimentary way).

git-svn-id: http://translate.svn.sourceforge.net/svnroot/translate/src/trunk/translate@7646 54714841-351b-0410-a198-e36a94b762f5
---
 storage/base.py    |  3 ++
 storage/pypo.py    | 85 +++++++++++++++++++++++++++--------------------
 storage/statsdb.py | 86 +++++++++++++++++++++++++++++++-----------------------
 3 files changed, 96 insertions(+), 78 deletions(-)

diff --git a/storage/base.py b/storage/base.py
index 7870dd4..d6c7e9f 100644
--- a/storage/base.py
+++ b/storage/base.py
@@ -43,6 +43,9 @@ def force_override(method, baseclass):
     if actualclass != baseclass:
         raise NotImplementedError("%s does not reimplement %s as required by %s" % (actualclass.__name__, method.__name__, baseclass.__name__))
 
+class ParseError(Exception):
+    pass
+
 class TranslationUnit(object):
     """Base class for translation units.
 
diff --git a/storage/pypo.py b/storage/pypo.py
index 66b5772..8eba309 100644
--- a/storage/pypo.py
+++ b/storage/pypo.py
@@ -27,7 +27,7 @@ from translate.misc.multistring import multistring
 from translate.misc import quote
 from translate.misc import textwrap
 from translate.lang import data
-from translate.storage import pocommon
+from translate.storage import pocommon, base
 import re
 
 lsep = "\n#: "
@@ -801,47 +801,50 @@ class pofile(pocommon.pofile):
 
     def parse(self, input):
         """parses the given file or file source string"""
-        if hasattr(input, 'name'):
-            self.filename = input.name
-        elif not getattr(self, 'filename', ''):
-            self.filename = ''
-        if hasattr(input, "read"):
-            posrc = input.read()
-            input.close()
-            input = posrc
-        # TODO: change this to a proper parser that doesn't do line-by-line madness
-        lines = input.split("\n")
-        start = 0
-        end = 0
-        # make only the first one the header
-        linesprocessed = 0
-        is_decoded = False
-        while end <= len(lines):
-            if (end == len(lines)) or (not lines[end].strip()): # end of lines or blank line
-                newpe = self.UnitClass(encoding=self._encoding)
-                unit_lines = lines[start:end]
-                # We need to work carefully if we haven't decoded properly yet.
-                # So let's solve this temporarily until we actually get the
-                # encoding from the header.
-                if not is_decoded:
-                    unit_lines = [line.decode('ascii', 'ignore') for line in unit_lines]
-                linesprocessed = newpe.parselines(unit_lines)
-                start += linesprocessed
-                # TODO: find a better way of working out if we actually read anything
-                if linesprocessed >= 1 and newpe._getoutput():
-                    self.units.append(newpe)
-                if not is_decoded:
-                    if newpe.isheader(): # If there is a header...
-                        if "Content-Type" in self.parseheader(): # and a Content-Type...
-                            if self._encoding.lower() != 'charset': # with a valid charset...
-                                self._encoding = newpe._encoding # then change the encoding
-                        # otherwise we'll decode using UTF-8
-                        lines = self.decode(lines)
-                        self.units = []
-                        start = 0
-                        end = 0
-                        is_decoded = True
-            end = end+1
+        try:
+            if hasattr(input, 'name'):
+                self.filename = input.name
+            elif not getattr(self, 'filename', ''):
+                self.filename = ''
+            if hasattr(input, "read"):
+                posrc = input.read()
+                input.close()
+                input = posrc
+            # TODO: change this to a proper parser that doesn't do line-by-line madness
+            lines = input.split("\n")
+            start = 0
+            end = 0
+            # make only the first one the header
+            linesprocessed = 0
+            is_decoded = False
+            while end <= len(lines):
+                if (end == len(lines)) or (not lines[end].strip()): # end of lines or blank line
+                    newpe = self.UnitClass(encoding=self._encoding)
+                    unit_lines = lines[start:end]
+                    # We need to work carefully if we haven't decoded properly yet.
+                    # So let's solve this temporarily until we actually get the
+                    # encoding from the header.
+                    if not is_decoded:
+                        unit_lines = [line.decode('ascii', 'ignore') for line in unit_lines]
+                    linesprocessed = newpe.parselines(unit_lines)
+                    start += linesprocessed
+                    # TODO: find a better way of working out if we actually read anything
+                    if linesprocessed >= 1 and newpe._getoutput():
+                        self.units.append(newpe)
+                    if not is_decoded:
+                        if newpe.isheader(): # If there is a header...
+                            if "Content-Type" in self.parseheader(): # and a Content-Type...
+                                if self._encoding.lower() != 'charset': # with a valid charset...
+                                    self._encoding = newpe._encoding # then change the encoding
+                            # otherwise we'll decode using UTF-8
+                            lines = self.decode(lines)
+                            self.units = []
+                            start = 0
+                            end = 0
+                            is_decoded = True
+                end = end+1
+        except Exception, e:
+            raise base.ParseError()
 
     def removeduplicates(self, duplicatestyle="merge"):
         """make sure each msgid is unique ; merge comments etc from duplicates into original"""
diff --git a/storage/statsdb.py b/storage/statsdb.py
index 6a0d4fd..e325b46 100644
--- a/storage/statsdb.py
+++ b/storage/statsdb.py
@@ -27,7 +27,7 @@
 """
 
 from translate import __version__ as toolkitversion
-from translate.storage import factory
+from translate.storage import factory, base
 from translate.misc.multistring import multistring
 from translate.lang.common import Common
 
@@ -93,16 +93,22 @@ def emptystats():
         stats[state + "targetwords"] = 0
     return stats
 
-def get_mod_info(file_path):
-    file_stat = os.stat(file_path)
-    # First, we multiply the mtime by 1000 to shift any millisecond values
-    # left of the .
-    # Then we make the somewhat daring assumption that 64 bits should be
-    # enough for any translation file size. To make space for the translation
-    # file size bits, we shift the mtime left by 64 bits.
-    return (long(file_stat.st_mtime * 1000) << 64) + file_stat.st_size
+def get_mod_info(file_path, errors_return_empty=False):
+    try:
+        file_stat = os.stat(file_path)
+        # First, we multiply the mtime by 1000 to shift any millisecond values
+        # left of the .
+        # Then we make the somewhat daring assumption that 64 bits should be
+        # enough for any translation file size. To make space for the translation
+        # file size bits, we shift the mtime left by 64 bits.
+        return (long(file_stat.st_mtime * 1000) << 64) + file_stat.st_size
+    except:
+        if errors_return_empty:
+            return 0
+        else:
+            raise
 
-def suggestioninfo(filename):
+def suggestioninfo(filename, **kwargs):
     """Provides the filename of the associated file containing suggestions
     and its mod_info, if it exists."""
     root, ext = os.path.splitext(filename)
@@ -116,7 +122,7 @@
         if not os.path.exists(suggestion_filename):
             suggestion_filename = None
         else:
-            suggestion_mod_info = get_mod_info(suggestion_filename)
+            suggestion_mod_info = get_mod_info(suggestion_filename, **kwargs)
     return suggestion_filename, suggestion_mod_info
 
 class StatsCache(object):
@@ -204,7 +210,7 @@
 
         self.con.commit()
 
-    def _getfileid(self, filename, opt_mod_info=-1, check_mod_info=True, store=None):
+    def _getfileid(self, filename, opt_mod_info=-1, check_mod_info=True, store=None, errors_return_empty=False):
         """Attempt to find the fileid of the given file, if it hasn't been
         updated since the last record update.
 
@@ -220,21 +226,27 @@
         self.cur.execute("""SELECT fileid, mod_info FROM files
                             WHERE path=?;""", (realpath,))
         filerow = self.cur.fetchone()
-        mod_info = max(opt_mod_info, get_mod_info(realpath))
-        if filerow:
-            fileid = filerow[0]
-            if not check_mod_info:
-                # Update the mod_info of the file
-                self.cur.execute("""UPDATE files
-                                    SET mod_info=?
-                                    WHERE fileid=?;""", (str(mod_info), fileid))
-                return fileid
-            if long(filerow[1]) == mod_info:
-                return fileid
-        # We can only ignore the mod_info if the row already exists:
-        assert check_mod_info
-        store = store or factory.getobject(filename)
-        return self.cachestore(store)
+        try:
+            mod_info = max(opt_mod_info, get_mod_info(realpath))
+            if filerow:
+                fileid = filerow[0]
+                if not check_mod_info:
+                    # Update the mod_info of the file
+                    self.cur.execute("""UPDATE files
+                                        SET mod_info=?
+                                        WHERE fileid=?;""", (str(mod_info), fileid))
+                    return fileid
+                if long(filerow[1]) == mod_info:
+                    return fileid
+            # We can only ignore the mod_info if the row already exists:
+            assert check_mod_info
+            store = store or factory.getobject(filename)
+            return self.cachestore(store)
+        except (base.ParseError, IOError, OSError):
+            if errors_return_empty:
+                return -1
+            else:
+                raise
 
     def _getstoredcheckerconfig(self, checker):
         """See if this checker configuration has been used before."""
@@ -303,13 +315,13 @@
             totals = emptystats()
         return self.cur.fetchall()
 
-    def filetotals(self, filename):
+    def filetotals(self, filename, **kwargs):
         """Retrieves the statistics for the given file if possible,
         otherwise delegates to cachestore()."""
         fileid = None
         if not fileid:
             try:
-                fileid = self._getfileid(filename)
+                fileid = self._getfileid(filename, **kwargs)
             except ValueError, e:
                 print >> sys.stderr, str(e)
                 return {}
@@ -404,14 +416,14 @@
             state.extend(self._cacheunitschecks([unit], fileid, configid, checker, unitindex))
         return state
 
-    def filechecks(self, filename, checker, store=None):
+    def filechecks(self, filename, checker, store=None, **kwargs):
         """Retrieves the error statistics for the given file if possible,
         otherwise delegates to cachestorechecks()."""
-        suggestion_filename, suggestion_mod_info = suggestioninfo(filename)
+        suggestion_filename, suggestion_mod_info = suggestioninfo(filename, **kwargs)
         fileid = None
         configid = self._getstoredcheckerconfig(checker)
         try:
-            fileid = self._getfileid(filename, suggestion_mod_info, store=store)
+            fileid = self._getfileid(filename, suggestion_mod_info, store=store, **kwargs)
             if not configid:
                 self.cur.execute("""INSERT INTO checkerconfigs
                                     (configid, config) values (NULL, ?);""",
@@ -450,13 +462,13 @@
 
         return errors
 
-    def filestats(self, filename, checker, store=None):
+    def filestats(self, filename, checker, store=None, **kwargs):
         """Return a dictionary of property names mapping sets of unit
         indices with those properties."""
         stats = {"total": [], "translated": [], "fuzzy": [], "untranslated": []}
 
-        stats.update(self.filechecks(filename, checker, store))
-        fileid = self._getfileid(filename, store=store)
+        stats.update(self.filechecks(filename, checker, store, **kwargs))
+        fileid = self._getfileid(filename, store=store, **kwargs)
 
         self.cur.execute("""SELECT
             state,
@@ -471,7 +483,7 @@
 
         return stats
 
-    def unitstats(self, filename, _lang=None, store=None):
+    def unitstats(self, filename, _lang=None, store=None, **kwargs):
         # For now, lang and store are unused. lang will allow the user to
         # base stats information on the given language. See the commented
         # line containing stats.update below.
@@ -484,7 +496,7 @@
         stats = {"sourcewordcount": [], "targetwordcount": []}
 
         #stats.update(self.unitchecks(filename, lang, store))
-        fileid = self._getfileid(filename, store=store)
+        fileid = self._getfileid(filename, store=store, **kwargs)
 
         self.cur.execute("""SELECT
             sourcewords, targetwords
-- 
2.11.4.GIT
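
Usage sketch (not part of the patch itself): a minimal illustration of the
calling convention the commit message describes, assuming the translate
toolkit layout at this revision (translate.storage.statsdb and
translate.storage.base). The cache path "stats.db" and the file
"example.po" are placeholders, and StatsCache is assumed to accept the
path of its SQLite cache file.

    from translate.storage import statsdb, base

    # Placeholder path; StatsCache is assumed to take the location of
    # its SQLite statistics cache.
    cache = statsdb.StatsCache("stats.db")

    # Default behaviour: a corrupt PO file makes the parser raise
    # base.ParseError, which propagates out of filetotals(), so the
    # caller must handle it.
    try:
        totals = cache.filetotals("example.po")
    except base.ParseError:
        totals = {}

    # With errors_return_empty=True, the keyword argument is forwarded
    # to _getfileid(), which swallows ParseError, IOError and OSError
    # and returns its -1 sentinel, so the caller gets empty or zero
    # statistics back instead of an exception.
    totals = cache.filetotals("example.po", errors_return_empty=True)

The sentinel values mirror each other: on failure get_mod_info() returns 0
and _getfileid() returns -1, so callers that opt in see "no data" for a
corrupt file rather than a crash.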