# This Python file uses the following encoding: utf-8
import re
from itertools import chain

import nltk.corpus

from framework import OntologyBuilderFramework, OntologyBuilderSynonyms
from util.cached import Cached

wn = nltk.corpus.wordnet
wn.synsets  # this is to get rid of stupid errors in my IDE due to nltk's lazy loading stuff

POSSESIVE = re.compile(r"(?:'s|')")  # matches possessive markers: 's or a bare apostrophe
DETERMINERS = set(('the', 'The', 'a', 'as', 'A', 'that', 'That', 'this', 'This'))
PREPOSITIONS = set(('to', 'To', 'in', 'In', 'on', 'On'))

EMPTY_SYNSET = frozenset()  # a word with no WordNet synsets yields this empty frozenset

class SimpleWordnetSynonyms(OntologyBuilderSynonyms):
    """
    Constructs synonyms from terms.

    Two terms are deemed synonyms if the synsets for their component words
    are identical.
    """

    @Cached(lambda *x, **y: OntologyBuilderFramework.SYNONYMS)
    def _get_synonyms(self, **state):
        """
        Constructs a dictionary mapping each synonym key to a list of terms,
        where a synonym key is defined as a list of WordNet synsets of the
        words in the term.

        As an ad-hoc decision, it strips out some determiners and flips
        everything around instances of the word "of".
        """
        terms = state[OntologyBuilderFramework.TERMS]

        synonyms = {}
        for term in terms:
            # Tokenise the term (assumed here to be a whitespace-separated
            # string), drop determiners, and strip possessive markers.
            subterms = map(lambda x: POSSESIVE.sub("", x),
                           filter(lambda x: x not in DETERMINERS,
                                  term.split()))
            # One frozenset of synset names per word, restricted to noun and
            # adjective senses.
            synset_list = tuple(frozenset(synset.name
                                          for synset in wn.synsets(subterm, pos=wn.NOUN + wn.ADJ))
                                for subterm in subterms)
            # Segment the term around occurrences of "of" and reverse the
            # segments, so that e.g. "X of Y" and "Y X" share a key.
            indices = self.__get_indices('of', subterms)
            phrases = self.__phrasify(subterms, indices)
            synset_list = tuple(chain(*(synset_list[a[0]:a[1]] for a in reversed(phrases))))
            if EMPTY_SYNSET in synset_list:
                continue  # just ignore anything w/out a synset

            if synset_list in synonyms:
                synonyms[synset_list].append(term)
            else:
                synonyms[synset_list] = [term]

        return synonyms
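
    # Illustrative sketch of the keying scheme above; the example terms are
    # invented, not taken from the original data:
    #
    #   "the rate of interest" -> subterms ['rate', 'of', 'interest']
    #                          -> phrases [(0, 1), (2, 3)], reversed and sliced
    #                          -> key (synsets('interest'), synsets('rate'))
    #   "interest rate"        -> subterms ['interest', 'rate']
    #                          -> phrases [(0, 2)]
    #                          -> key (synsets('interest'), synsets('rate'))
    #
    # Both terms land under the same key, so they are grouped as synonyms.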
    def __get_indices(self, x, ys):
        """Return the indices in ys at which x occurs."""
        return filter(lambda i: ys[i] == x, xrange(len(ys)))
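
    # For example (a sketch; assumes Python 2, where filter returns a list):
    #   __get_indices('of', ['rate', 'of', 'interest'])  ->  [1]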
    def __phrasify(self, subterms, indices):
        """
        Assuming that "indices" marks the positions of function words in
        subterms, returns a list of (start, stop) spans covering the
        subphrases that contain actual lexical items.
        """
        phrases = []
        start = 0
        for index in indices:
            stop = index
            phrases.append((start, stop))
            start = index + 1
        phrases.append((start, len(subterms)))
        return phrases
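
    # For example (a sketch built on the span-splitting above; the inputs are
    # invented for illustration):
    #   __phrasify(['rate', 'of', 'interest'], [1])  ->  [(0, 1), (2, 3)]
    # _get_synonyms reverses these spans and uses them to slice the per-word
    # synset tuple, dropping the function word itself.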