From 8cb6e97adec1ab57adccaefb04684c136d15750f Mon Sep 17 00:00:00 2001
From: Micah Jacob
Date: Wed, 11 May 2011 17:08:57 -0400
Subject: [PATCH] partial documentation; some fantastic caching stuff... via
 annotation..

---
 .settings/org.eclipse.core.resources.prefs      |   4 +-
 src/mjacob/ontologybuilder/framework.py         |  91 ++++++++++++------
 src/mjacob/ontologybuilder/highlighted_terms.py | 108 +++++++++++++--------
 src/util/cache_util.py                          |  28 ------
 src/util/cached.py                              | 123 ++++++++++++++++++++++++
 5 files changed, 256 insertions(+), 98 deletions(-)
 delete mode 100644 src/util/cache_util.py
 create mode 100644 src/util/cached.py

diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs
index fb7e541..6c3703d 100644
--- a/.settings/org.eclipse.core.resources.prefs
+++ b/.settings/org.eclipse.core.resources.prefs
@@ -1,4 +1,4 @@
-#Sun May 08 23:31:27 EDT 2011
+#Wed May 11 16:57:29 EDT 2011
 eclipse.preferences.version=1
 encoding//src/mjacob/ontologybuilder/framework.py=utf-8
 encoding//src/mjacob/ontologybuilder/highlighted_terms.py=utf-8
@@ -8,7 +8,7 @@ encoding//src/mjacob/ontologybuilder/simple_concept_hierarchies.py=utf-8
 encoding//src/mjacob/ontologybuilder/simple_concepts.py=utf-8
 encoding//src/mjacob/ontologybuilder/simple_subterms.py=utf-8
 encoding//src/mjacob/ontologybuilder/simple_wordnet_synonyms.py=utf-8
-encoding//src/util/cache_util.py=utf-8
+encoding//src/util/cached.py=utf-8
 encoding//test/standalone/for_fun.py=utf-8
 encoding//test/standalone/symbol_ontology/symbol_ontology_test.py=utf-8
 encoding//util/create_symbol_corpus.py=utf-8
diff --git a/src/mjacob/ontologybuilder/framework.py b/src/mjacob/ontologybuilder/framework.py
index 37999fd..1f9558b 100644
--- a/src/mjacob/ontologybuilder/framework.py
+++ b/src/mjacob/ontologybuilder/framework.py
@@ -4,11 +4,17 @@ Created on Apr 29, 2011
 
 @author: mjacob
 '''
-
-import yaml
-from util.cache_util import get_cache, is_cached, set_cache
-
-class OntologyBuilderFramework(object):
+import os
+from util.cached import cacheable
+class OntologyBuilderFramework(cacheable):
+    """A framework for building domain ontologies, assuming the process consists of the following steps:
+    C{TERMS}: identify terms (from a corpus or such)
+    C{SYNONYMS}: identify synonyms amongst the terms
+    C{CONCEPTS}: identify concepts
+    C{CONCEPT_HIERARCHIES}: identify hierarchical relationships amongst concepts
+    C{RELATIONS}: identify other relationships amongst concepts
+    """
+
     TERMS="terms"
     SYNONYMS="synonyms"
     CONCEPTS="concepts"
@@ -16,44 +22,63 @@ class OntologyBuilderFramework(object):
     RELATIONS="relations"
 
     def __init__(self, initial_state, name, cachedir):
+        """
+        @type initial_state: C{dict}
+        @param initial_state: additional state available when processing with a framework instance
+        @type name: C{str}
+        @param name: the name of this framework instance (used for debugging)
+        @type cachedir: C{str}
+        @param cachedir: the base directory in which to cache the ontology generation process
+        """
+        cacheable.__init__(self, os.path.join(cachedir, name))
         self.__state = initial_state
         self.__name = name
-        self.__cachedir = cachedir
-
 
     def state(self):
+        """
+        get the current state.
+
+        @rtype: C{dict}
+        """
        return self.__state
 
     def name(self):
+        """
+        get the name of the framework instance.
+
+        @rtype: C{str}
+        """
         return self.__name
 
     def _do_step(self, step):
+        """
+        returns C{True} if the framework instance is designed to perform step C{step}
+
+        @rtype: C{bool}
+        """
         return True
 
-    def _is_cached(self, component):
-        return is_cached(self.__cachedir, self.__name, component)
-
-    def _get_cache(self, component):
-        return get_cache(self.__cachedir, self.__name, component)
-
-    def _set_cache(self, component, object):
-        return set_cache(self.__cachedir, self.__name, component, object)
-
-    def write_state(self, state, filename):
-        with open(filename, 'w') as fh:
-            fh.write(yaml.dump(state))
-
-    def read_state(self, filename):
-        with open(filename, 'w') as fh:
-            return yaml.load(fh.read())
-
     def __get_initial_state(self, additional_state):
-        state = dict(self.state())
-        for key, value, in additional_state.items():
-            state[key] = value
+        """
+        construct a state collection from the defaults for this object,
+        plus anything additional supplied for the specific run.
+        """
+        state = self.state().copy()
+        state.update(additional_state)
         return state
 
     def process(self, only_do=None, ignore_cache=None, **additional_state):
+        """
+        iterate through the predefined steps to construct an ontology.
+
+        @param only_do: if specified, only perform steps in this collection
+        @param ignore_cache: if specified, ignore any cached results from
+        steps specified in this collection.  note that any new results
+        will still be saved to cache, possibly overwriting existing results.
+
+        @return: the resulting state
+        """
+
         state = self.__get_initial_state(additional_state)
 
         for step in (OntologyBuilderFramework.TERMS,
@@ -63,11 +88,12 @@ class OntologyBuilderFramework(object):
                      OntologyBuilderFramework.RELATIONS):
 
             if self._do_step(step) and not (only_do and not step in only_do):
-                if (not (ignore_cache and step in ignore_cache)) and self._is_cached(step):
-                    result = self._get_cache(step)
+                if (not (ignore_cache and step in ignore_cache)) and self.is_cached(step):
+                    result = self.get_cache(step)
+                else:
                     result = self.__getattribute__('_get_%s' % step)(**state)
-                    self._set_cache(step, result)
+                    self.set_cache(step, result)
 
                 if not result:
                     raise Exception("no result (%s) at step %s" % (result, step))
@@ -78,22 +104,27 @@ class OntologyBuilderFramework(object):
         return state
 
 class OntologyBuilderTerms(object):
+    """interface for building terms for an ontology"""
     def _get_terms(self, **state):
         pass
 
 class OntologyBuilderSynonyms(object):
+    """interface for building synonyms (usually of terms) for an ontology"""
     def _get_synonyms(self, **state):
         pass
 
 class OntologyBuilderConcepts(object):
+    """interface for constructing concepts for an ontology"""
     def _get_concepts(self, **state):
         pass
 
 class OntologyBuilderConceptHierarchies(object):
+    """interface for constructing hierarchies of concepts for an ontology"""
     def _get_concept_hierarchies(self, **state):
         pass
 
 class OntologyBuilderRelations(object):
+    """interface for building relations between concepts in an ontology"""
     def _get_relations(self, **state):
         pass
\ No newline at end of file
diff --git a/src/mjacob/ontologybuilder/highlighted_terms.py b/src/mjacob/ontologybuilder/highlighted_terms.py
index 781a140..9624a5f 100644
--- a/src/mjacob/ontologybuilder/highlighted_terms.py
+++ b/src/mjacob/ontologybuilder/highlighted_terms.py
@@ -8,10 +8,49 @@ Created on May 3, 2011
 
 from framework import OntologyBuilderTerms
 import math
+from util.cached import cached
 
 class HighlightedTerms(OntologyBuilderTerms):
+    """
+    a term extractor for use in generating a domain ontology.
-    def __init__(self, corpus, highlights, other_corpora, tagger, chunker, chunk_filter, subterms, term_filter):
+
+    given a corpus in which terms may or may not be highlighted, term candidates
+    are extracted using a tagger and a chunker.  term candidates are kept
+    if they fit certain statistical patterns relative to the term candidates in
+    other corpora.
+    """
+
+    def __init__(self,
+                 corpus,
+                 highlights,
+                 other_corpora,
+                 tagger,
+                 chunker,
+                 chunk_filter,
+                 subterms,
+                 statistical_filter):
+        """
+        @type corpus: C{nltk.corpus.reader.api.CorpusReader}
+        @param corpus: a body of files tokenized by sentence and word
+        @type highlights: C{nltk.corpus.reader.api.CorpusReader}
+        @param highlights: the same body of files, but tokenized over highlighted
+        portions of the text.
+        @type other_corpora: C{list} of C{nltk.corpus.reader.api.CorpusReader}
+        @type tagger: C{nltk.tag.api.TaggerI}
+        @param tagger: a tagger to apply to the corpora
+        @type chunker: C{nltk.chunk.api.ChunkParserI}
+        @param chunker: a chunker used to mark terms in a tagged sentence
+        @type chunk_filter: function(C{nltk.tree.Tree}) -> C{list} of C{list}s
+        @param chunk_filter: a function which extracts the relevant components of a chunked sentence
+        @type subterms: function(list) -> C{list} of C{str}
+        @param subterms: a function which, given tokens, returns potential subterm strings of a term
+        @type statistical_filter: function(C{dict}, C{dict}, C{dict}) -> C{set}
+        @param statistical_filter: a function which selects the statistically relevant terms.
+        argument 1: a dict of per-term occurrence statistics
+        argument 2: a dict relating terms to their relative occurrence in the target corpus vs. the other corpora
+        argument 3: a dict relating terms to their entropic relevance in the domain corpus
+
+        """
         self.__corpus = corpus
         self.__highlights = highlights
         self.__other_corpora = other_corpora
@@ -19,7 +58,7 @@ class HighlightedTerms(OntologyBuilderTerms):
         self.__chunker = chunker
         self.__chunk_filter = chunk_filter
         self.__subterms = subterms
-        self.__term_filter = term_filter
+        self.__statistical_filter = statistical_filter
 
     def _get_terms(self, **state):
         """input: 1 domain corpus, 1 general corpus
@@ -33,35 +72,41 @@ class HighlightedTerms(OntologyBuilderTerms):
         term_relevences = self.__get_term_relevences(main_statistics, other_stats)
         term_entropies = self.__get_term_entropies(main_statistics, len(self.__corpus.fileids()))
 
-        terms = self.__term_filter(main_statistics, term_relevences, term_entropies)
+        terms = self.__statistical_filter(main_statistics, term_relevences, term_entropies)
 
         return terms
 
+    @cached(lambda *x: "entropies")
     def __get_term_entropies(self, statistics, doc_count):
-        cache_filename = "entropies"
-        if self._is_cached(cache_filename):
-            entropies = self._get_cache(cache_filename)
-            return entropies
+        """
+        calculate the normalized entropy of a term,
+        N = total number of documents
+        n = the number of documents the term appears in
+        p = n/N
+        normalized entropy H = p * log(1/p) / log(N)
+        http://www.xycoon.com/normalized_entropy.htm
+        """
 
         entropies = {}
-        entropy_total = 0.0
         for term in statistics:
             prob = (1.0*len(statistics[term])) / doc_count
-            entropy = prob * math.log(1/prob, 2)
-            entropy_total += entropy
-            entropies[term] = entropy
+            entropies[term] = prob * math.log(1/prob)
 
         for term in entropies:
-            entropies[term] /= (entropy_total *.001)
-
-        self._set_cache(cache_filename, entropies)
+            entropies[term] /= math.log(doc_count) * .001 # normalize by log(N); the .001 scales values up 1000x to make them easier to work with
 
         return entropies
 
-    def __symbol_count(self, statistics):
-        n = 0
+    def __get_term_counts(self, statistics):
+        """
+        total the number of occurrences of each term across all files.
+        """
+        term_counts = {}
+
         for term in statistics:
-            for file in statistics[term]:
-                n += statistics[term][file]
+            term_counts[term] = sum(statistics[term].values())
+
+        return term_counts
+
 
     def __count(self, term, statistics, size):
         n = 0
@@ -70,36 +115,24 @@ class HighlightedTerms(OntologyBuilderTerms):
                 n += statistics[term][file]
         return n
 
-
+    @cached(lambda *x: "relevences")
     def __get_term_relevences(self, main_statistics, other_stats):
-        cache_filename = "relevences"
-        if self._is_cached(cache_filename):
-            term_relevences = self._get_cache(cache_filename)
-            return term_relevences
-
         term_relevences = {}
 
-        main_size = self.__symbol_count(main_statistics)
-        other_sizes = [self.__symbol_count(stat) for stat in other_stats]
+        main_term_counts = self.__get_term_counts(main_statistics)
+        other_term_counts = [self.__get_term_counts(stat) for stat in other_stats]
 
         for term in main_statistics:
-            numerator = self.__count(term, main_statistics, main_size)
-            denominator = sum(self.__count(term, other_stats[i], other_sizes[i]) for i in xrange(len(other_stats)))
+            numerator = main_term_counts[term]
+            denominator = sum(other_term_count.get(term, 0) for other_term_count in other_term_counts)
             term_relevences[term] = (1.0*numerator) / (numerator + denominator)
 
-        self._set_cache(cache_filename, term_relevences)
-
         return term_relevences
 
+    @cached(lambda corpus, corpus_name, highlights: "%s.statistics" % (corpus_name))
     def __get_statistics(self, corpus, corpus_name, highlights):
-        print "finding terms in %s" % (corpus_name)
-
-        cache_filename = "%s.statistics" % (corpus_name)
-        if self._is_cached(cache_filename):
-            term_statistics = self._get_cache(cache_filename)
-            return term_statistics
-
+
         term_statistics = {}
         for file in corpus.fileids():
             print "processing %s" % (file)
@@ -133,7 +166,6 @@ class HighlightedTerms(OntologyBuilderTerms):
                     term_statistics[term] = {file: count}
 
         print "processed %s" % (file)
-        self._set_cache(cache_filename, term_statistics)
 
         return term_statistics
 
diff --git a/src/util/cache_util.py b/src/util/cache_util.py
deleted file mode 100644
index 06a76cd..0000000
--- a/src/util/cache_util.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# This Python file uses the following encoding: utf-8
-'''
-Created on May 5, 2011
-
-@author: mjacob
-'''
-import os
-#import yaml # if used, use CYaml or whatever it is
-import cPickle
-
-def is_cached(dir, name, component):
-    cachedir = os.path.join(dir, name, component)
-    found = os.path.exists(cachedir)
-    if found:
-        print "cached file found: %s" % (cachedir)
-    else:
-        print "cached file not found: %s" % (cachedir)
-    return found
-
-def get_cache(dir, name, component):
-    print "reading '%s/%s/%s' from cache" % (dir, name, component)
-    return cPickle.load(open(os.path.join(dir, name, component)))
-
-def set_cache(dir, name, component, object):
-    if not os.path.exists(os.path.join(dir, name)):
-        os.makedirs(os.path.join(dir, name))
-    print "writing '%s/%s/%s' to cache" % (dir, name, component)
-    cPickle.dump(object, open(os.path.join(dir, name, component), 'w'))
\ No newline at end of file
diff --git a/src/util/cached.py b/src/util/cached.py
new file mode 100644
index 0000000..45ce5ad
--- /dev/null
+++ b/src/util/cached.py
@@ -0,0 +1,123 @@
+# This Python file uses the following encoding: utf-8
+'''
+Created on May 11, 2011
+
+@author: mjacob
+'''
+import os
+import functools
+import cPickle
+
+class cacheable(object):
+    def __init__(self, cachedir):
+        self.__cachedir = cachedir
+
+    def _filename(self, component):
+        return os.path.join(self.__cachedir, component)
+
+    def is_cached(self, component):
+        """
+        returns C{True} if the specified C{component} is cached
+
+        @type component: C{str}
+        @rtype: C{bool}
+        """
+        filename = self._filename(component)
+        found = os.path.exists(filename)
+
+        if found:
+            print "cached file found: %s" % (filename)
+        else:
+            print "cached file not found: %s" % (filename)
+
+        return found
+
+    def get_cache(self, component, loader=cPickle):
+        """
+        returns the specified cached component
+
+        @type component: C{str}
+        @rtype: object
+        """
+        filename = self._filename(component)
+        print "reading '%s' from cache" % (filename)
+
+        with open(filename) as file:
+            return loader.load(file)
+
+    def set_cache(self, component, object, dumper=cPickle):
+        """
+        writes the specified component to cache.
+
+        @type component: C{str}
+        @type object: anything pickle-able
+        """
+
+        if not os.path.exists(self.__cachedir):
+            os.makedirs(self.__cachedir)
+
+        filename = self._filename(component)
+
+        print "writing '%s' to cache" % (filename)
+
+        with open(filename, 'w') as file:
+            dumper.dump(object, file)
+
+        return filename
+
+
+class cached(object):
+    """Decorator that caches a method's return value on disk, under the
+    component name produced by C{key}.  If that component is already cached,
+    the cached value is returned and the method is not re-evaluated.
+    """
+    def __init__(self, key, loader=cPickle, dumper=cPickle):
+        self.__key = key
+        self.__loader = loader
+        self.__dumper = dumper
+
+    def __call__(self, func):
+        def caller(*args, **kwargs):
+            cacher = args[0]
+            if kwargs:
+                component = self.__key(*args[1:], **kwargs)
+            else:
+                component = self.__key(*args[1:])
+            if cacher.is_cached(component):
+                return cacher.get_cache(component, self.__loader)
+            else:
+                if kwargs:
+                    result = func(*args, **kwargs)
+                else:
+                    result = func(*args)
+                cacher.set_cache(component, result, self.__dumper)
+                return result
+        return caller
+
+    def __repr__(self):
+        """Return a representation of the key function."""
+        return repr(self.__key)
+
+    def __get__(self, obj, objtype):
+        """Support instance methods."""
+        return functools.partial(self.__call__, obj)
+
+
+class Test(cacheable):
+    count = 0
+
+    def __init__(self):
+        cacheable.__init__(self, os.path.join("/Users/mjacob/testmemoize", "test memoize"))
+
+    @cached(lambda number, *x: str(number))
+    def do_thing(self, number, *others, **kwargs):
+        self.count += 1
+        return (number, self.count)
+
+if __name__ == "__main__":
+    a = Test()
+    print a.do_thing(1,4)
+    print a.do_thing(2,5)
+    print a.do_thing(3,6)
+    print a.do_thing(1,4)
+    print "..."
\ No newline at end of file
-- 
2.11.4.GIT
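
For reference, a minimal usage sketch of the caching API this patch introduces (not part of the patch itself). It assumes src/util is on the Python path; the class name TermCounter and the cache path are illustrative assumptions, not names from the patch:

# -*- coding: utf-8 -*-
# illustrative sketch only: TermCounter and /tmp/ontology-cache are made-up names
import os
from util.cached import cacheable, cached

class TermCounter(cacheable):
    """counts word occurrences per document; results are pickled to disk by key."""

    def __init__(self, cachedir):
        # each instance caches under its own subdirectory, as framework.py does
        cacheable.__init__(self, os.path.join(cachedir, "term counter"))

    # the key function receives the arguments after self and names the cache file
    @cached(lambda fileid, words: "%s.counts" % fileid)
    def count(self, fileid, words):
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

if __name__ == "__main__":
    counter = TermCounter("/tmp/ontology-cache")
    # first call computes the result and writes "<cachedir>/doc1.counts"
    print counter.count("doc1", ["a", "b", "a"])
    # second call hits the cache and unpickles the stored result
    print counter.count("doc1", ["a", "b", "a"])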