From ab33d3aea5ecd6f5466462211bde725701e0e9b3 Mon Sep 17 00:00:00 2001
From: Micah Jacob
Date: Thu, 12 May 2011 21:25:12 -0400
Subject: [PATCH] nearly everything good to go, modulo a minor hierarchy bug

---
 src/mjacob/ontologybuilder/highlighted_terms.py         | 17 +++++++++++++----
 src/mjacob/ontologybuilder/simple_concepts.py           |  6 +++++-
 src/mjacob/ontologybuilder/simple_wordnet_synonyms.py   |  5 +++--
 test/standalone/symbol_ontology/symbol_ontology_test.py | 15 ++++++++++-----
 4 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/src/mjacob/ontologybuilder/highlighted_terms.py b/src/mjacob/ontologybuilder/highlighted_terms.py
index b654422..d847425 100644
--- a/src/mjacob/ontologybuilder/highlighted_terms.py
+++ b/src/mjacob/ontologybuilder/highlighted_terms.py
@@ -29,7 +29,8 @@ class HighlightedTerms(OntologyBuilderTerms):
                  chunker,
                  chunk_filter,
                  subterms,
-                 statistical_filter):
+                 statistical_filter,
+                 debug=False):
         """
         @type corpus: C{nltk.corpus.reader.api.CorpusReader}
         @param corpus: a body of files tokenized by sentence and word
@@ -60,6 +61,7 @@ class HighlightedTerms(OntologyBuilderTerms):
         self.__chunk_filter = chunk_filter
         self.__subterms = subterms
         self.__statistical_filter = statistical_filter
+        self.__debug = debug
 
     @Cached(lambda *x, **y: OntologyBuilderFramework.TERMS)
     def _get_terms(self, **state):
@@ -136,14 +138,18 @@ class HighlightedTerms(OntologyBuilderTerms):
         """
         discovers how many times each term appears in the specified corpus
         """
-        print "finding terms in %s" % (corpus_name)
+        if self.__debug:
+            print "finding terms in %s" % (corpus_name)
         term_statistics = {}
 
         for file in corpus.fileids():
-            print "processing %s" % (file)
+            if self.__debug:
+                print "processing %s" % (file)
             termcounts_in_file = {}
             sentences = corpus.sents(file)
+
             for sentence in sentences:
+                """discover term candidates and count them"""
                 tagged = self.__tagger.tag(sentence)
                 chunked = self.__chunker.parse(tagged)
                 relevent_chunks = self.__chunk_filter(chunked)
@@ -153,7 +159,9 @@
                             termcounts_in_file[subterm] += 1
                         else:
                             termcounts_in_file[subterm] = 1
+
             if highlights:
+                """process the highlights, amending the existing terms"""
                 sentence_highlights = highlights.sents(file)
                 for term_sentence in sentence_highlights:
                     for term in term_sentence:
@@ -169,7 +177,8 @@
                     term_statistics[term][file] = count
                 else:
                     term_statistics[term] = {file: count}
-            print "processed %s" % (file)
+            if self.__debug:
+                print "finished processing %s" % (file)
 
         return term_statistics
 
diff --git a/src/mjacob/ontologybuilder/simple_concepts.py b/src/mjacob/ontologybuilder/simple_concepts.py
index f6129e0..0ce0d8f 100644
--- a/src/mjacob/ontologybuilder/simple_concepts.py
+++ b/src/mjacob/ontologybuilder/simple_concepts.py
@@ -129,9 +129,12 @@ class SimpleConcepts(OntologyBuilderConcepts):
             if len(synonym) < 2:
                 continue
 
+            """make sure that the head of the concept is nominal"""
             nominal_concept = self.__ensure_nominal(synonym)
+            if not nominal_concept[-1]:
+                continue # if not a nominal concept, just skip it
 
-            concept = self.get_concept(synonym)
+            concept = self.get_concept(nominal_concept)
 
             if concept:
                 concept = tuple(concept) # finalize it
@@ -147,6 +150,7 @@
         return concepts
 
     def __ensure_nominal(self, synonym):
+        """filters out non-nominal synsets from the head of the term"""
         filtered_end = filter(NOMINAL_SYNSET.match, synonym[-1])
         return tuple(chain(synonym[:-1], [filtered_end]))
 
diff --git a/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py b/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py
index f05feca..bf87b75 100644
--- a/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py
+++ b/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py
@@ -45,8 +45,9 @@ class SimpleWordnetSynonyms(OntologyBuilderSynonyms):
             indices = self.__get_indices('of', subterms)
             phrases = self.__phrasify(subterms, indices)
             synset_list = tuple(chain(*(synset_list[a[0]:a[1]] for a in reversed(phrases))))
-            if EMPTY_SYNSET in synset_list:
-                continue # just ignore anything w/out a synset
+
+            if EMPTY_SYNSET in synset_list:
+                continue # just ignore anything w/out a synset
 
             if synset_list in synonyms:
                 synonyms[synset_list].append(term)
diff --git a/test/standalone/symbol_ontology/symbol_ontology_test.py b/test/standalone/symbol_ontology/symbol_ontology_test.py
index 518c96b..21630a4 100644
--- a/test/standalone/symbol_ontology/symbol_ontology_test.py
+++ b/test/standalone/symbol_ontology/symbol_ontology_test.py
@@ -47,6 +47,8 @@ class SymbolOntologyBuilder(OntologyBuilderFramework,
     SymbolOntologyBuilder is implemented on top of the C{OntologyBuilderFramework},
     as a combination of C{HighlightedTerms}, C{SimpleWordnetSynonyms}, C{SimpleConcepts},
     and C{SimpleConceptHierarchies}, as well as being C{Cacheable}.
+
+    Note that all of the components REQUIRE that the base object be C{Cacheable}.
     """
 
     OTHER_CORPORA = "terms.other_corpora"
@@ -142,7 +144,8 @@ class SymbolOntologyBuilder(OntologyBuilderFramework,
                                   chunker,
                                   chunk_filter,
                                   subterms,
-                                  term_filter)
+                                  term_filter,
+                                  debug=True)
 
     def __init_synonyms(self):
         SimpleWordnetSynonyms.__init__(self)
@@ -156,8 +159,8 @@
 if __name__ == "__main__":
     builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml',
                                     only_do=set((
-                                        OntologyBuilderFramework.TERMS,
-                                        OntologyBuilderFramework.SYNONYMS,
+                                        #OntologyBuilderFramework.TERMS,
+                                        #OntologyBuilderFramework.SYNONYMS,
                                         OntologyBuilderFramework.CONCEPTS,
                                         OntologyBuilderFramework.CONCEPT_HIERARCHIES
                                         )),
@@ -165,7 +168,9 @@ if __name__ == "__main__":
                                         #OntologyBuilderFramework.TERMS,
                                         #OntologyBuilderFramework.SYNONYMS,
                                         #OntologyBuilderFramework.CONCEPTS,
-                                        #OntologyBuilderFramework.CONCEPT_HIERARCHIES,
+                                        OntologyBuilderFramework.CONCEPT_HIERARCHIES,
                                         )))
     state = builder.process()
-    #print "\n".join(sorted(state[OntologyBuilderFramework.TERMS]))
+
+    # this will display the concept hierarchies, but takes A LONG TIME
+    #state[OntologyBuilderFramework.CONCEPT_HIERARCHIES].draw()
-- 
2.11.4.GIT