From 1e26150e0937ae2a86a4ee97f6d5b083ca1d6f32 Mon Sep 17 00:00:00 2001 From: Micah Jacob Date: Mon, 9 May 2011 13:50:16 -0400 Subject: [PATCH] seems pretty good. need to make hierarchies moreso... --- src/mjacob/ontologybuilder/simple_concepts.py | 32 +++++++++++++++------- .../ontologybuilder/simple_wordnet_synonyms.py | 9 +++--- test/standalone/for_fun.py | 3 +- .../symbol_ontology/symbol_ontology_test.py | 4 +-- 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/mjacob/ontologybuilder/simple_concepts.py b/src/mjacob/ontologybuilder/simple_concepts.py index 0a3058b..26a7481 100644 --- a/src/mjacob/ontologybuilder/simple_concepts.py +++ b/src/mjacob/ontologybuilder/simple_concepts.py @@ -112,6 +112,7 @@ class SimpleConcepts(OntologyBuilderConcepts): scores.append(score) if debug: + print subterm_synsets print scores index = self.__get_best_score_index(scores) @@ -126,12 +127,14 @@ class SimpleConcepts(OntologyBuilderConcepts): if score_total != 0: return score - return score # it's all 0s, we'll deal w/ that later + return [0] # it's all 0s, we'll deal w/ that later def __get_initial_specific_score(self, synset, synonym, other_synsets): score = [] total_score = 0 for other_synset in other_synsets: + if type(other_synset) is not str and type(other_synset) is not unicode: + raise Exception("wtf is up w/ this type %s %s (%s)" % (other_synset, type(other_synset), synonym)) intersection_score = self.__get_intersection_score(synset, other_synset) score.append(intersection_score) total_score += intersection_score @@ -145,6 +148,7 @@ class SimpleConcepts(OntologyBuilderConcepts): scores.append(score) if debug: + print subterm_synsets print scores index = self.__get_best_score_index(scores) @@ -159,7 +163,7 @@ class SimpleConcepts(OntologyBuilderConcepts): if score_total != 0: return score - return score # it's all 0s, we'll deal w/ that later + return [0] # it's all 0s, we'll deal w/ that later def __get_specific_score(self, synset, synonym, other_synsets): score = [] @@ -213,6 +217,14 @@ class SimpleConcepts(OntologyBuilderConcepts): return score return 0 + def __glosses(self, synset): + l = list(synset.examples) + l.append(synset.definition) + for other in synset.similar_tos(): + l.append(other.definition) + l.extend(other.examples) + return l + def __get_synonymy_score(self, S1, S2): if S1 == S2 or S2 in chain(*[(b.synset for b in a.pertainyms()) for a in S1.lemmas]): return 1 @@ -268,11 +280,11 @@ class SimpleConcepts(OntologyBuilderConcepts): def __get_gloss_score(self, S1, S2): for lemma in S2.lemmas: - for example in S1.examples: + for example in self.__glosses(S1): if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')): return 1 for lemma in S1.lemmas: - for example in S2.examples: + for example in self.__glosses(S2): if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')): return 1 return 0 @@ -281,7 +293,7 @@ class SimpleConcepts(OntologyBuilderConcepts): return 0 def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2): - for example in S1.examples: + for example in self.__glosses(S1): for word in example.split(' '): for synset in wn.synsets(word): if (self.__get_hypernymy_meronymy_path_score(synset, S2) @@ -290,7 +302,7 @@ class SimpleConcepts(OntologyBuilderConcepts): return 0 def __get_gloss_parallelism_score(self, S1, S2): - for example in S1.examples: + for example in self.__glosses(S1): for word in example.split(' '): for synset in wn.synsets(word): if self.__get_parallelism_score(synset, S2): @@ -298,8 +310,8 @@ class SimpleConcepts(OntologyBuilderConcepts): return 0 def __get_gloss_gloss_score(self, S1, S2): - synsets_lists1 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in S1.examples))) - synsets_lists2 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in S2.examples))) + synsets_lists1 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S1)))) + synsets_lists2 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S2)))) for synsets in synsets_lists1: if synsets in synsets_lists2: return 1 @@ -317,10 +329,10 @@ class SimpleConcepts(OntologyBuilderConcepts): for i in xrange(len(scores)): score = scores[i] if score: - if not best_score_index: + if best_score_index is None: if len(filter(lambda x: x != 0, score)) != 0: best_score_index = i - elif self.__is_better_score(scores[i], scores[best_score_index]): + elif self.__is_better_score(score, scores[best_score_index]): best_score_index = i return best_score_index diff --git a/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py b/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py index 9ee955e..257170e 100644 --- a/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py +++ b/src/mjacob/ontologybuilder/simple_wordnet_synonyms.py @@ -16,13 +16,13 @@ class SimpleWordnetSynonyms(OntologyBuilderSynonyms): terms = state[OntologyBuilderFramework.TERMS] synonyms = {} for term in terms: - subterms = term.split(' ') + subterms = filter(lambda x: x not in set(('the', 'The', 'a', 'as', 'A')), term.split(' ')) synset_list = tuple(frozenset(synset.name for synset in wn.synsets(subterm, pos=wn.NOUN+wn.ADJ)) for subterm in subterms) if 'of' in subterms: indices = self.__get_indices('of', subterms) phrases = self.__phrasify(subterms, indices) - synset_list = list(chain(synset_list[a[0]:a[1]] for a in reversed(phrases))) + synset_list = tuple(chain(*(synset_list[a[0]:a[1]] for a in reversed(phrases)))) if synset_list in synonyms: synonyms[synset_list].append(term) @@ -36,16 +36,17 @@ class SimpleWordnetSynonyms(OntologyBuilderSynonyms): for i in xrange(len(ys)): if ys[i] == x: indices.append(i) + return indices def __phrasify(self, subterms, indices): phrases = [] start = 0 for index in indices: stop = index - if start + 1 < stop: + if start < stop: phrases.append((start, stop)) start = stop + 1 - phrases.append(start, len(subterms)) + phrases.append((start, len(subterms))) return phrases \ No newline at end of file diff --git a/test/standalone/for_fun.py b/test/standalone/for_fun.py index 37176f6..a164535 100644 --- a/test/standalone/for_fun.py +++ b/test/standalone/for_fun.py @@ -12,7 +12,8 @@ wn = wordnet a = SimpleConcepts() def gogo(thing): print " ".join([wn.synset(n).definition - for n in a.get_concept([[s.name for s in wn.synsets(x)] + for n in a.get_concept([[s.name for s in wn.synsets(x, pos='an')] for x in thing.split(' ')], debug=True)]) + gogo('hobo sign') \ No newline at end of file diff --git a/test/standalone/symbol_ontology/symbol_ontology_test.py b/test/standalone/symbol_ontology/symbol_ontology_test.py index ed6bcd4..aa496e7 100644 --- a/test/standalone/symbol_ontology/symbol_ontology_test.py +++ b/test/standalone/symbol_ontology/symbol_ontology_test.py @@ -91,13 +91,13 @@ class SymbolOntologyBuilder(OntologyBuilderFramework, if __name__ == "__main__": builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml') state = builder.process(only_do=set(( - #OntologyBuilderFramework.TERMS, + OntologyBuilderFramework.TERMS, OntologyBuilderFramework.SYNONYMS, OntologyBuilderFramework.CONCEPTS, OntologyBuilderFramework.CONCEPT_HIERARCHIES)), ignore_cache=set(( #OntologyBuilderFramework.TERMS, - #OntologyBuilderFramework.SYNONYMS, + OntologyBuilderFramework.SYNONYMS, OntologyBuilderFramework.CONCEPTS, OntologyBuilderFramework.CONCEPT_HIERARCHIES,))) #print "\n".join(sorted(state[OntologyBuilderFramework.TERMS])) -- 2.11.4.GIT