From 67a6bf32d24e9bea854e661d37ec88b4d5bad0fc Mon Sep 17 00:00:00 2001 From: Micah Jacob Date: Sat, 7 May 2011 18:16:17 -0400 Subject: [PATCH] terms seem to work again --- src/mjacob/ontologybuilder/framework.py | 4 +-- src/mjacob/ontologybuilder/highlighted_terms.py | 34 ++++++++++------------ .../ontologybuilder/simple_concept_hierarchies.py | 3 ++ src/util/cache_util.py | 8 ++++- .../symbol_ontology/symbol_ontology_builder.yaml | 27 ++++++++++------- .../symbol_ontology/symbol_ontology_test.py | 14 ++++----- 6 files changed, 52 insertions(+), 38 deletions(-) rewrite test/standalone/symbol_ontology/symbol_ontology_builder.yaml (76%) diff --git a/src/mjacob/ontologybuilder/framework.py b/src/mjacob/ontologybuilder/framework.py index 16edcde..fab2914 100644 --- a/src/mjacob/ontologybuilder/framework.py +++ b/src/mjacob/ontologybuilder/framework.py @@ -71,11 +71,11 @@ class OntologyBuilderConcepts(object): def _get_concepts(self, **state): pass -def OntologyBuilderConceptHierarchies(object): +class OntologyBuilderConceptHierarchies(object): def _get_concept_hierarchies(self, **state): pass -def OntologyBuilderRelations(object): +class OntologyBuilderRelations(object): def _get_relations(self, **state): pass \ No newline at end of file diff --git a/src/mjacob/ontologybuilder/highlighted_terms.py b/src/mjacob/ontologybuilder/highlighted_terms.py index f2934f7..583b589 100644 --- a/src/mjacob/ontologybuilder/highlighted_terms.py +++ b/src/mjacob/ontologybuilder/highlighted_terms.py @@ -27,23 +27,21 @@ class HighlightedTerms(OntologyBuilderTerms): identify term candidates in each (NPs that fit some statistics) remove terms identified in general corpus from domain corpus""" - main_statistics = self.__get_statistics(self.__corpus, "main", self.__highlights, state) - other_stats = [self.__get_statistics(other_corpus, other_corpus_name, None, state) - for other_corpus, other_corpus_name in self.__other_corpora - ] + main_statistics = self.__get_statistics(self.__corpus, "main", self.__highlights) + other_stats = [self.__get_statistics(other_corpus, other_corpus_name, None) + for other_corpus, other_corpus_name in self.__other_corpora] term_relevences = self.__get_term_relevences(main_statistics, other_stats) term_entropies = self.__get_term_entropies(main_statistics, len(self.__corpus.files())) - term_filter = state["term_filter"] - terms = term_filter(main_statistics, term_relevences, term_entropies) + terms = self.__term_filter(main_statistics, term_relevences, term_entropies) return terms def __get_term_entropies(self, statistics, doc_count): cache_filename = "entropies" - if is_cached(self._cachedir, self._name, cache_filename): - entropies = get_cache(self._cachedir, self._name, cache_filename) + if is_cached(self.cachedir(), self.name(), cache_filename): + entropies = get_cache(self.cachedir(), self.name(), cache_filename) return entropies entropies = {} @@ -56,7 +54,7 @@ class HighlightedTerms(OntologyBuilderTerms): for term in entropies: entropies[term] /= (entropy_total *.001) - set_cache(self._cachedir, self._name, cache_filename, entropies) + set_cache(self.cachedir(), self.name(), cache_filename, entropies) return entropies @@ -76,8 +74,8 @@ class HighlightedTerms(OntologyBuilderTerms): def __get_term_relevences(self, main_statistics, other_stats): cache_filename = "relevences" - if is_cached(self._cachedir, self._name, cache_filename): - term_relevences = get_cache(self._cachedir, self._name, cache_filename) + if is_cached(self.cachedir(), self.name(), cache_filename): + term_relevences = get_cache(self.cachedir(), self.name(), cache_filename) return term_relevences term_relevences = {} @@ -90,17 +88,17 @@ class HighlightedTerms(OntologyBuilderTerms): denominator = sum(self.__count(term, other_stats[i], other_sizes[i]) for i in xrange(len(other_stats))) term_relevences[term] = (1.0*numerator) / (numerator + denominator) - set_cache(self._cachedir, self._name, cache_filename, term_relevences) + set_cache(self.cachedir(), self.name(), cache_filename, term_relevences) return term_relevences - def __get_statistics(self, corpus, corpus_name, highlights, state): + def __get_statistics(self, corpus, corpus_name, highlights): print "finding terms in %s" % (corpus_name) cache_filename = "%s.statistics" % (corpus_name) - if is_cached(self._cachedir, self._name, cache_filename): - term_statistics = get_cache(self._cachedir, self._name, cache_filename) + if is_cached(self.cachedir(), self.name(), cache_filename): + term_statistics = get_cache(self.cachedir(), self.name(), cache_filename) return term_statistics term_statistics = {} @@ -113,7 +111,7 @@ class HighlightedTerms(OntologyBuilderTerms): chunked = self.__chunker.parse(tagged) relevent_chunks = self.__chunk_filter(chunked) for chunk in relevent_chunks: - for subterm in self.__get_subterms(chunk): + for subterm in self.__subterms(chunk): if subterm in termcounts_in_file: termcounts_in_file[subterm] += 1 else: @@ -122,7 +120,7 @@ class HighlightedTerms(OntologyBuilderTerms): sentence_highlights = highlights.sents(file) for term_sentence in sentence_highlights: for term in term_sentence: - for subterm in self.__get_subterms(chunk): + for subterm in self.__subterms(chunk): if subterm in termcounts_in_file: termcounts_in_file[subterm] += 1 else: @@ -136,7 +134,7 @@ class HighlightedTerms(OntologyBuilderTerms): term_statistics[term] = {file: count} print "processed %s" % (file) - set_cache(self._cachedir, self._name, cache_filename, term_statistics) + set_cache(self.cachedir(), self.name(), cache_filename, term_statistics) return term_statistics diff --git a/src/mjacob/ontologybuilder/simple_concept_hierarchies.py b/src/mjacob/ontologybuilder/simple_concept_hierarchies.py index 7620dbd..8a33030 100644 --- a/src/mjacob/ontologybuilder/simple_concept_hierarchies.py +++ b/src/mjacob/ontologybuilder/simple_concept_hierarchies.py @@ -19,4 +19,7 @@ class SimpleConceptHierarchies(OntologyBuilderConceptHierarchies): def _get_concept_hierarchies(self, **state): raise Exception() + #lch_similarity(self, other) + #path_similarity(self, other) + #wup_similarity(self, other) pass diff --git a/src/util/cache_util.py b/src/util/cache_util.py index 74ff487..06a76cd 100644 --- a/src/util/cache_util.py +++ b/src/util/cache_util.py @@ -9,7 +9,13 @@ import os import cPickle def is_cached(dir, name, component): - return os.path.exists(os.path.join(dir, name, component)) + cachedir = os.path.join(dir, name, component) + found = os.path.exists(cachedir) + if found: + print "cached file found: %s" % (cachedir) + else: + print "cached file not found: %s" % (cachedir) + return found def get_cache(dir, name, component): print "reading '%s/%s/%s' from cache" % (dir, name, component) diff --git a/test/standalone/symbol_ontology/symbol_ontology_builder.yaml b/test/standalone/symbol_ontology/symbol_ontology_builder.yaml dissimilarity index 76% index 0d20f36..4fc9749 100644 --- a/test/standalone/symbol_ontology/symbol_ontology_builder.yaml +++ b/test/standalone/symbol_ontology/symbol_ontology_builder.yaml @@ -1,10 +1,17 @@ -terms.NavigliTermFilter.alpha: .2 -terms.NavigliTermFilter.threshold: .4 -terms.other_corpora: - - !!python/name:nltk.corpus.brown - - !!python/name:nltk.corpus.gutenberg - - !!python/name:nltk.corpus.inaugural - - !!python/name:nltk.corpus.state_union - - !!python/name:nltk.corpus.movie_reviews - - !!python/name:nltk.corpus.reuters - - !!python/name:nltk.corpus.treebank_raw +terms.NavigliTermFilter.alpha: .2 +terms.NavigliTermFilter.threshold: .4 +terms.other_corpora: + - - !!python/name:nltk.corpus.brown + - brown + - - !!python/name:nltk.corpus.gutenberg + - gutenberg + - - !!python/name:nltk.corpus.inaugural + - inaugural + - - !!python/name:nltk.corpus.state_union + - state_union + - - !!python/name:nltk.corpus.movie_reviews + - movie_reviews + - - !!python/name:nltk.corpus.reuters + - reuters + - - !!python/name:nltk.corpus.treebank_raw + - treebank_raw diff --git a/test/standalone/symbol_ontology/symbol_ontology_test.py b/test/standalone/symbol_ontology/symbol_ontology_test.py index ab76e92..86e740d 100644 --- a/test/standalone/symbol_ontology/symbol_ontology_test.py +++ b/test/standalone/symbol_ontology/symbol_ontology_test.py @@ -21,12 +21,12 @@ from mjacob.ontologybuilder import (OntologyBuilderFramework , NavigliTermFilter ) -VERSION = 3 +VERSION = 4 NAME = 'symbol ontology generator V.%s' % (VERSION) -BASEDIR = os.path.split(__file__)[0] +BASEDIR = os.path.split(os.path.abspath(__file__))[0] CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache') CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus') -CORPUSFILES = os.path.join(CORPUSDIR, '*.txt.4') +CORPUSFILES = os.path.join(CORPUSDIR, '*.txt') class SymbolOntologyBuilder(OntologyBuilderFramework, HighlightedTerms, @@ -34,7 +34,7 @@ class SymbolOntologyBuilder(OntologyBuilderFramework, SimpleConcepts, SimpleConceptHierarchies, dict): - OTHER_CORPORA = "other_corpora" + OTHER_CORPORA = "terms.other_corpora" ALPHA = "terms.NavigliTermFilter.alpha" THRESHOLD = "terms.NavigliTermFilter.threshold" @@ -55,11 +55,11 @@ class SymbolOntologyBuilder(OntologyBuilderFramework, def __init_terms(self): corpus = PlaintextCorpusReader(root=CORPUSDIR, - fileids=glob(CORPUSFILES), + fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)], word_tokenizer=RegexpTokenizer(r'img#\w+|#\d+|\w+|\w+\'\w+|(\w\.){2:}'), encoding='utf-8') highlights = PlaintextCorpusReader(root=CORPUSDIR, - fileids=glob(CORPUSFILES), + fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)], word_tokenizer=RegexpTokenizer(r'<[^>]+>'), encoding='utf-8') other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA] @@ -90,4 +90,4 @@ class SymbolOntologyBuilder(OntologyBuilderFramework, if __name__ == "__main__": builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml') - print "\n".join(sorted(builder._get_terms(**builder.get_state()))) \ No newline at end of file + print "\n".join(sorted(builder._get_terms(**builder.state()))) \ No newline at end of file -- 2.11.4.GIT