From 8cb6e97adec1ab57adccaefb04684c136d15750f Mon Sep 17 00:00:00 2001
From: Micah Jacob
Date: Wed, 11 May 2011 17:08:57 -0400
Subject: [PATCH] partial documentation; some fantastic caching stuff... via
 annotation..

---
 .settings/org.eclipse.core.resources.prefs      |   4 +-
 src/mjacob/ontologybuilder/framework.py         |  91 ++++++++++++------
 src/mjacob/ontologybuilder/highlighted_terms.py | 108 +++++++++++++--------
 src/util/cache_util.py                          |  28 ------
 src/util/cached.py                              | 123 ++++++++++++++++++++++++
 5 files changed, 256 insertions(+), 98 deletions(-)
 delete mode 100644 src/util/cache_util.py
 create mode 100644 src/util/cached.py

diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs
index fb7e541..6c3703d 100644
--- a/.settings/org.eclipse.core.resources.prefs
+++ b/.settings/org.eclipse.core.resources.prefs
@@ -1,4 +1,4 @@
-#Sun May 08 23:31:27 EDT 2011
+#Wed May 11 16:57:29 EDT 2011
 eclipse.preferences.version=1
 encoding//src/mjacob/ontologybuilder/framework.py=utf-8
 encoding//src/mjacob/ontologybuilder/highlighted_terms.py=utf-8
@@ -8,7 +8,7 @@ encoding//src/mjacob/ontologybuilder/simple_concept_hierarchies.py=utf-8
 encoding//src/mjacob/ontologybuilder/simple_concepts.py=utf-8
 encoding//src/mjacob/ontologybuilder/simple_subterms.py=utf-8
 encoding//src/mjacob/ontologybuilder/simple_wordnet_synonyms.py=utf-8
-encoding//src/util/cache_util.py=utf-8
+encoding//src/util/cached.py=utf-8
 encoding//test/standalone/for_fun.py=utf-8
 encoding//test/standalone/symbol_ontology/symbol_ontology_test.py=utf-8
 encoding//util/create_symbol_corpus.py=utf-8
diff --git a/src/mjacob/ontologybuilder/framework.py b/src/mjacob/ontologybuilder/framework.py
index 37999fd..1f9558b 100644
--- a/src/mjacob/ontologybuilder/framework.py
+++ b/src/mjacob/ontologybuilder/framework.py
@@ -4,11 +4,17 @@ Created on Apr 29, 2011
 
 @author: mjacob
 '''
-
-import yaml
-from util.cache_util import get_cache, is_cached, set_cache
-
-class OntologyBuilderFramework(object):
+import os
+from util.cached import cacheable
+class OntologyBuilderFramework(cacheable):
+    """A framework for building domain ontologies, assuming the process consists of the following steps:
+    C{TERMS}: identify terms (from a corpus or such)
+    C{SYNONYMS}: identify synonyms amongst the terms
+    C{CONCEPTS}: identify concepts
+    C{CONCEPT_HIERARCHIES}: identify hierarchical relationships amongst concepts
+    C{RELATIONS}: identify other relationships amongst concepts
+    """
+
     TERMS="terms"
     SYNONYMS="synonyms"
     CONCEPTS="concepts"
@@ -16,44 +22,63 @@ class OntologyBuilderFramework(object):
     RELATIONS="relations"
 
     def __init__(self, initial_state, name, cachedir):
+        """
+        @type initial_state: C{dict}
+        @param initial_state: additional state available when processing with a framework instance
+        @type name: C{str}
+        @param name: the name of this framework instance (used for debugging)
+        @type cachedir: C{str}
+        @param cachedir: the base directory in which to cache the ontology generation process
+        """
+        cacheable.__init__(self, os.path.join(cachedir, name))
         self.__state = initial_state
         self.__name = name
-        self.__cachedir = cachedir
-
 
     def state(self):
+        """
+        get the current state.
+
+        @rtype: C{dict}
+        """
        return self.__state
 
     def name(self):
+        """
+        get the name of the framework instance.
+
+        @rtype: C{str}
+        """
         return self.__name
 
     def _do_step(self, step):
+        """
+        returns C{True} if the framework instance is designed to perform step C{step}
+
+        @rtype: C{bool}
+        """
         return True
 
-    def _is_cached(self, component):
-        return is_cached(self.__cachedir, self.__name, component)
-
-    def _get_cache(self, component):
-        return get_cache(self.__cachedir, self.__name, component)
-
-    def _set_cache(self, component, object):
-        return set_cache(self.__cachedir, self.__name, component, object)
-
-    def write_state(self, state, filename):
-        with open(filename, 'w') as fh:
-            fh.write(yaml.dump(state))
-
-    def read_state(self, filename):
-        with open(filename, 'w') as fh:
-            return yaml.load(fh.read())
-
     def __get_initial_state(self, additional_state):
-        state = dict(self.state())
-        for key, value, in additional_state.items():
-            state[key] = value
+        """
+        construct a state collection from the defaults for this object,
+        plus anything additional supplied for the specific run.
+        """
+        state = self.state().copy()
+        state.update(additional_state)
         return state
 
     def process(self, only_do=None, ignore_cache=None, **additional_state):
+        """
+        iterate through the predefined steps to construct an ontology.
+
+        @param only_do: if specified, only perform steps in this collection
+        @param ignore_cache: if specified, ignore any cached results from
+        steps specified in this collection.  note that any new results
+        will still be saved to cache, possibly overwriting existing results.
+
+        @return: the resulting state
+        """
+
         state = self.__get_initial_state(additional_state)
 
         for step in (OntologyBuilderFramework.TERMS,
@@ -63,11 +88,12 @@ class OntologyBuilderFramework(object):
                      OntologyBuilderFramework.RELATIONS):
 
             if self._do_step(step) and not (only_do and not step in only_do):
-                if (not (ignore_cache and step in ignore_cache)) and self._is_cached(step):
-                    result = self._get_cache(step)
+                if (not (ignore_cache and step in ignore_cache)) and self.is_cached(step):
+                    result = self.get_cache(step)
+                else:
                     result = self.__getattribute__('_get_%s' % step)(**state)
-                    self._set_cache(step, result)
+                    self.set_cache(step, result)
 
                 if not result:
                     raise Exception("no result (%s) at step %s" % (result, step))
@@ -78,22 +104,27 @@ class OntologyBuilderFramework(object):
         return state
 
 class OntologyBuilderTerms(object):
+    """interface for building terms for an ontology"""
     def _get_terms(self, **state):
         pass
 
 class OntologyBuilderSynonyms(object):
+    """interface for building synonyms (usually of terms) for an ontology"""
     def _get_synonyms(self, **state):
         pass
 
 class OntologyBuilderConcepts(object):
+    """interface for constructing concepts for an ontology"""
     def _get_concepts(self, **state):
         pass
 
 class OntologyBuilderConceptHierarchies(object):
+    """interface for constructing hierarchies of concepts for an ontology"""
     def _get_concept_hierarchies(self, **state):
         pass
 
 class OntologyBuilderRelations(object):
+    """interface for building relations between concepts in an ontology"""
     def _get_relations(self, **state):
         pass
\ No newline at end of file
diff --git a/src/mjacob/ontologybuilder/highlighted_terms.py b/src/mjacob/ontologybuilder/highlighted_terms.py
index 781a140..9624a5f 100644
--- a/src/mjacob/ontologybuilder/highlighted_terms.py
+++ b/src/mjacob/ontologybuilder/highlighted_terms.py
@@ -8,10 +8,49 @@ Created on May 3, 2011
 
 from framework import OntologyBuilderTerms
 import math
+from util.cached import cached
 
 class HighlightedTerms(OntologyBuilderTerms):
+    """
+    a term extractor for use in generating a domain ontology.
-    def __init__(self, corpus, highlights, other_corpora, tagger, chunker, chunk_filter, subterms, term_filter):
+
+    given a corpus in which terms may or may not be highlighted, term candidates
+    are extracted using a tagger and a chunker.  term candidates are kept
+    if they fit certain statistical patterns relative to the term candidates in
+    other corpora.
+    """
+
+    def __init__(self,
+                 corpus,
+                 highlights,
+                 other_corpora,
+                 tagger,
+                 chunker,
+                 chunk_filter,
+                 subterms,
+                 statistical_filter):
+        """
+        @type corpus: C{nltk.corpus.reader.api.CorpusReader}
+        @param corpus: a body of files tokenized by sentence and word
+        @type highlights: C{nltk.corpus.reader.api.CorpusReader}
+        @param highlights: the same body of files, but tokenized over highlighted
+        portions of the text.
+        @type other_corpora: C{list} of C{nltk.corpus.reader.api.CorpusReader}
+        @type tagger: C{nltk.tag.api.TaggerI}
+        @param tagger: a tagger to apply to the corpora
+        @type chunker: C{nltk.chunk.api.ChunkParserI}
+        @param chunker: a chunker used to mark terms in a tagged sentence
+        @type chunk_filter: function(C{nltk.tree.Tree}) -> C{list} of C{list}s
+        @param chunk_filter: a function which extracts the relevant components of a chunked sentence
+        @type subterms: function(list) -> C{list} of C{str}
+        @param subterms: a function which, given tokens, returns potential subterm strings of a term
+        @type statistical_filter: function(C{dict}, C{dict}, C{dict}) -> C{set}
+        @param statistical_filter: a function which selects the statistically relevant terms.
+        argument 1: a dict of per-term occurrence statistics
+        argument 2: a dict relating terms to their relative occurrence in the target corpus vs. the other corpora
+        argument 3: a dict relating terms to their entropic relevance in the domain corpus
+
+        """
         self.__corpus = corpus
         self.__highlights = highlights
         self.__other_corpora = other_corpora
@@ -19,7 +58,7 @@ class HighlightedTerms(OntologyBuilderTerms):
         self.__chunker = chunker
         self.__chunk_filter = chunk_filter
         self.__subterms = subterms
-        self.__term_filter = term_filter
+        self.__statistical_filter = statistical_filter
 
     def _get_terms(self, **state):
         """input: 1 domain corpus, 1 general corpus
@@ -33,35 +72,41 @@ class HighlightedTerms(OntologyBuilderTerms):
         term_relevences = self.__get_term_relevences(main_statistics, other_stats)
         term_entropies = self.__get_term_entropies(main_statistics, len(self.__corpus.fileids()))
 
-        terms = self.__term_filter(main_statistics, term_relevences, term_entropies)
+        terms = self.__statistical_filter(main_statistics, term_relevences, term_entropies)
 
         return terms
 
+    @cached(lambda *x: "entropies")
     def __get_term_entropies(self, statistics, doc_count):
-        cache_filename = "entropies"
-        if self._is_cached(cache_filename):
-            entropies = self._get_cache(cache_filename)
-            return entropies
+        """
+        calculate the normalized entropy of a term,
+        N = total number of documents
+        n = the number of documents the term appears in
+        p = n/N
+        normalized entropy H = p * log(1/p) / log(N)
+        http://www.xycoon.com/normalized_entropy.htm
+        """
 
         entropies = {}
-        entropy_total = 0.0
         for term in statistics:
             prob = (1.0*len(statistics[term])) / doc_count
-            entropy = prob * math.log(1/prob, 2)
-            entropy_total += entropy
-            entropies[term] = entropy
+            entropies[term] = prob * math.log(1/prob)
 
         for term in entropies:
-            entropies[term] /= (entropy_total *.001)
-
-        self._set_cache(cache_filename, entropies)
+            entropies[term] /= math.log(doc_count) * .001 # normalize by log(N); the .001 scales values up 1000x to make them easier to work with
 
         return entropies
 
-    def __symbol_count(self, statistics):
-        n = 0
+    def __get_term_counts(self, statistics):
+        """
+        total the number of occurrences of each term across all files.
+        """
+        term_counts = {}
+
         for term in statistics:
-            for file in statistics[term]:
-                n += statistics[term][file]
+            term_counts[term] = sum(statistics[term].values())
+
+        return term_counts
+
 
     def __count(self, term, statistics, size):
         n = 0
@@ -70,36 +115,24 @@ class HighlightedTerms(OntologyBuilderTerms):
                 n += statistics[term][file]
         return n
 
-
+    @cached(lambda *x: "relevences")
     def __get_term_relevences(self, main_statistics, other_stats):
-        cache_filename = "relevences"
-        if self._is_cached(cache_filename):
-            term_relevences = self._get_cache(cache_filename)
-            return term_relevences
-
         term_relevences = {}
 
-        main_size = self.__symbol_count(main_statistics)
-        other_sizes = [self.__symbol_count(stat) for stat in other_stats]
+        main_term_counts = self.__get_term_counts(main_statistics)
+        other_term_counts = [self.__get_term_counts(stat) for stat in other_stats]
 
         for term in main_statistics:
-            numerator = self.__count(term, main_statistics, main_size)
-            denominator = sum(self.__count(term, other_stats[i], other_sizes[i]) for i in xrange(len(other_stats)))
+            numerator = main_term_counts[term]
+            denominator = sum(other_term_count.get(term, 0) for other_term_count in other_term_counts)
             term_relevences[term] = (1.0*numerator) / (numerator + denominator)
 
-        self._set_cache(cache_filename, term_relevences)
-
         return term_relevences
 
+    @cached(lambda corpus, corpus_name, highlights: "%s.statistics" % (corpus_name))
     def __get_statistics(self, corpus, corpus_name, highlights):
-        print "finding terms in %s" % (corpus_name)
-
-        cache_filename = "%s.statistics" % (corpus_name)
-        if self._is_cached(cache_filename):
-            term_statistics = self._get_cache(cache_filename)
-            return term_statistics
-
+
         term_statistics = {}
         for file in corpus.fileids():
             print "processing %s" % (file)
@@ -133,7 +166,6 @@ class HighlightedTerms(OntologyBuilderTerms):
                     term_statistics[term] = {file: count}
 
         print "processed %s" % (file)
-        self._set_cache(cache_filename, term_statistics)
 
         return term_statistics
 
diff --git a/src/util/cache_util.py b/src/util/cache_util.py
deleted file mode 100644
index 06a76cd..0000000
--- a/src/util/cache_util.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# This Python file uses the following encoding: utf-8
-'''
-Created on May 5, 2011
-
-@author: mjacob
-'''
-import os
-#import yaml # if used, use CYaml or whatever it is
-import cPickle
-
-def is_cached(dir, name, component):
-    cachedir = os.path.join(dir, name, component)
-    found = os.path.exists(cachedir)
-    if found:
-        print "cached file found: %s" % (cachedir)
-    else:
-        print "cached file not found: %s" % (cachedir)
-    return found
-
-def get_cache(dir, name, component):
-    print "reading '%s/%s/%s' from cache" % (dir, name, component)
-    return cPickle.load(open(os.path.join(dir, name, component)))
-
-def set_cache(dir, name, component, object):
-    if not os.path.exists(os.path.join(dir, name)):
-        os.makedirs(os.path.join(dir, name))
-    print "writing '%s/%s/%s' to cache" % (dir, name, component)
-    cPickle.dump(object, open(os.path.join(dir, name, component), 'w'))
\ No newline at end of file
diff --git a/src/util/cached.py b/src/util/cached.py
new file mode 100644
index 0000000..45ce5ad
--- /dev/null
+++ b/src/util/cached.py
@@ -0,0 +1,123 @@
+# This Python file uses the following encoding: utf-8
+'''
+Created on May 11, 2011
+
+@author: mjacob
+'''
+import os
+import functools
+import cPickle
+
+class cacheable(object):
+    def __init__(self, cachedir):
+        self.__cachedir = cachedir
+
+    def _filename(self, component):
+        return os.path.join(self.__cachedir, component)
+
+    def is_cached(self, component):
+        """
+        returns C{True} if the specified C{component} is cached
+
+        @type component: C{str}
+        @rtype: C{bool}
+        """
+        filename = self._filename(component)
+        found = os.path.exists(filename)
+
+        if found:
+            print "cached file found: %s" % (filename)
+        else:
+            print "cached file not found: %s" % (filename)
+
+        return found
+
+    def get_cache(self, component, loader=cPickle):
+        """
+        returns the specified cached component
+
+        @type component: C{str}
+        @rtype: object
+        """
+        filename = self._filename(component)
+        print "reading '%s' from cache" % (filename)
+
+        with open(filename) as file:
+            return loader.load(file)
+
+    def set_cache(self, component, object, dumper=cPickle):
+        """
+        writes the specified component to cache.
+
+        @type component: C{str}
+        @type object: anything pickle-able
+        """
+
+        if not os.path.exists(self.__cachedir):
+            os.makedirs(self.__cachedir)
+
+        filename = self._filename(component)
+
+        print "writing '%s' to cache" % (filename)
+
+        with open(filename, 'w') as file:
+            dumper.dump(object, file)
+
+        return filename
+
+
+class cached(object):
+    """Decorator that caches a method's return value on disk, under the
+    component name produced by C{key}.  If that component is already cached,
+    the cached value is returned and the method is not re-evaluated.
+    """
+    def __init__(self, key, loader=cPickle, dumper=cPickle):
+        self.__key = key
+        self.__loader = loader
+        self.__dumper = dumper
+
+    def __call__(self, func):
+        def caller(*args, **kwargs):
+            cacher = args[0]
+            if kwargs:
+                component = self.__key(*args[1:], **kwargs)
+            else:
+                component = self.__key(*args[1:])
+            if cacher.is_cached(component):
+                return cacher.get_cache(component, self.__loader)
+            else:
+                if kwargs:
+                    result = func(*args, **kwargs)
+                else:
+                    result = func(*args)
+                cacher.set_cache(component, result, self.__dumper)
+                return result
+        return caller
+
+    def __repr__(self):
+        """Return a representation of the key function."""
+        return repr(self.__key)
+
+    def __get__(self, obj, objtype):
+        """Support instance methods."""
+        return functools.partial(self.__call__, obj)
+
+
+class Test(cacheable):
+    count = 0
+
+    def __init__(self):
+        cacheable.__init__(self, os.path.join("/Users/mjacob/testmemoize", "test memoize"))
+
+    @cached(lambda number, *x: str(number))
+    def do_thing(self, number, *others, **kwargs):
+        self.count += 1
+        return (number, self.count)
+
+if __name__ == "__main__":
+    a = Test()
+    print a.do_thing(1,4)
+    print a.do_thing(2,5)
+    print a.do_thing(3,6)
+    print a.do_thing(1,4)
+    print "..."
\ No newline at end of file
-- 
2.11.4.GIT
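
For reference, a minimal usage sketch of the caching API this patch introduces (not part of the patch itself). It assumes src/util is on the Python path; the class name TermCounter and the cache path are illustrative assumptions, not names from the patch:

# -*- coding: utf-8 -*-
# illustrative sketch only: TermCounter and /tmp/ontology-cache are made-up names
import os
from util.cached import cacheable, cached

class TermCounter(cacheable):
    """counts word occurrences per document; results are pickled to disk by key."""

    def __init__(self, cachedir):
        # each instance caches under its own subdirectory, as framework.py does
        cacheable.__init__(self, os.path.join(cachedir, "term counter"))

    # the key function receives the arguments after self and names the cache file
    @cached(lambda fileid, words: "%s.counts" % fileid)
    def count(self, fileid, words):
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

if __name__ == "__main__":
    counter = TermCounter("/tmp/ontology-cache")
    # first call computes the result and writes "<cachedir>/doc1.counts"
    print counter.count("doc1", ["a", "b", "a"])
    # second call hits the cache and unpickles the stored result
    print counter.count("doc1", ["a", "b", "a"])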