nearly everything good to go, mod minor hierarchy bug
[nltk_ontology_framework.git] / src / mjacob / ontologybuilder /
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 5, 2011
5 @author: mjacob
6 '''
7 from framework import OntologyBuilderFramework, OntologyBuilderSynonyms
8 import nltk
9 from util.cached import Cached
10 wn = nltk.corpus.wordnet
11 wn.synsets # this is to get rid of stupid errors in my IDE due to nltk's lazy loading stuff
12 from itertools import chain
13 import re
# Possessive markers ("'s" or a bare apostrophe) that get stripped from words.
POSSESIVE = re.compile(r"(?:'s|')")

# Words removed from a term before its synsets are looked up.
# NOTE(review): 'as' is listed while 'an' is not -- possibly a typo; confirm.
DETERMINERS = set([
    'the', 'The',
    'a', 'as', 'A',
    'that', 'That',
    'this', 'This',
])

# Not referenced by the code visible in this file.
PREPOSITIONS = set([
    'to', 'To',
    'in', 'In',
    'on', 'On',
])

# Sentinel used to recognise a word for which no synsets were found.
EMPTY_SYNSET = frozenset()
class SimpleWordnetSynonyms(OntologyBuilderSynonyms):
    """
    Constructs synonyms from terms.

    Two terms are deemed synonyms if the synsets for their component words
    are identical.
    """

    @Cached(lambda *x, **y: OntologyBuilderFramework.SYNONYMS)
    def _get_synonyms(self, **state):
        """
        Construct a dictionary mapping synonym keys to lists of terms.

        A synonym key is a tuple holding, for each word of the term, the
        frozenset of WordNet noun/adjective synsets found for that word.

        As an ad-hoc decision, determiners are dropped, possessive markers
        are stripped, and the phrases around each occurrence of the word
        "of" are put in reverse order (the "of" itself is removed), so that
        "X of Y" and "Y X" end up with the same key.

        Terms are read from state[OntologyBuilderFramework.TERMS].  A term is
        ignored when any of its words has no synset, or when nothing is left
        of it once determiners and "of" have been removed.
        """
        terms = state[OntologyBuilderFramework.TERMS]
        synonyms = {}
        for term in terms:
            # A real list rather than map()/filter(): it is traversed several
            # times below, which a one-shot iterator would not survive.
            subterms = [POSSESIVE.sub("", word)
                        for word in term.split(' ')
                        if word not in DETERMINERS]

            # NOTE(review): each element is the set of synsets themselves, as
            # the docstring describes -- confirm this is the intended key.
            synset_list = tuple(
                frozenset(wn.synsets(subterm, pos=wn.NOUN + wn.ADJ))
                for subterm in subterms)

            if 'of' in subterms:
                indices = self.__get_indices('of', subterms)
                phrases = self.__phrasify(subterms, indices)
                # Concatenate the phrases last-to-first; the "of" positions
                # are not part of any phrase and therefore disappear.
                synset_list = tuple(chain(*(synset_list[start:stop]
                                            for start, stop in reversed(phrases))))

            if not synset_list:
                # Nothing lexical left; without this every such term would
                # share the empty key and be reported as synonyms.
                continue

            if EMPTY_SYNSET in synset_list:
                continue  # just ignore anything w/out a synset

            synonyms.setdefault(synset_list, []).append(term)

        return synonyms

    def __get_indices(self, x, ys):
        """Return the list of indices in ys at which x occurs."""
        return [i for i, y in enumerate(ys) if y == x]

    def __phrasify(self, subterms, indices):
        """
        Assuming that "indices" is an ascending list of the positions of
        function words in subterms, return a list of (start, stop) half-open
        spans covering the non-empty runs of words between them.
        """
        phrases = []
        start = 0
        for index in indices:
            if start < index:
                phrases.append((start, index))
            start = index + 1
        # Only add the tail when it actually contains words (it is empty
        # when the last word is itself a function word).
        if start < len(subterms):
            phrases.append((start, len(subterms)))
        return phrases