# This Python file uses the following encoding: utf-8
import re
from itertools import chain

import nltk.corpus
import nltk.corpus.reader.wordnet

from framework import OntologyBuilderConcepts, OntologyBuilderFramework
from util.cached import Cached
from util.memoized import Memoized
13 wn
= nltk
.corpus
.wordnet
# this is to subvert a bug in my IDE
16 DOMAIN_LABEL
= re
.compile('\(of ([^\)]*)\)')
17 NOMINAL_SYNSET
= re
.compile('\w+\.n\.\d+$')
21 Shortcuts for referring to particular relationships in wordnet
23 HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.hypernyms}
24 INSTANCE_HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hypernyms}
26 HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)
28 HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.hyponyms}
29 INSTANCE_HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hyponyms}
31 HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)
33 MEMBER_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.member_holonyms}
34 SUBSTANCE_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_holonyms}
35 PART_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.part_holonyms}
37 HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)
39 MEMBER_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.member_meronyms}
40 SUBSTANCE_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_meronyms}
41 PART_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.part_meronyms}
43 MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)
45 ATTRIBUTES = C{nltk.corpus.reader.wordnet.Synset.attributes}
46 ENTAILMENTS = C{nltk.corpus.reader.wordnet.Synset.entailments}
47 CAUSES = C{nltk.corpus.reader.wordnet.Synset.causes}
48 ALSO_SEES = C{nltk.corpus.reader.wordnet.Synset.also_sees}
49 VERB_GROUPS = C{nltk.corpus.reader.wordnet.Synset.verb_groups}
50 SIMILAR_TOS = C{nltk.corpus.reader.wordnet.Synset.similar_tos}
54 HYPERNYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.hypernyms
55 INSTANCE_HYPERNYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.instance_hypernyms
57 HYPERNYMY
= (HYPERNYMS
, INSTANCE_HYPERNYMS
)
59 HYPONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.hyponyms
60 INSTANCE_HYPONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.instance_hyponyms
62 HYPONYMY
= (HYPONYMS
, INSTANCE_HYPONYMS
)
64 MEMBER_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.member_holonyms
65 SUBSTANCE_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.substance_holonyms
66 PART_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.part_holonyms
68 HOLONYMY
= (MEMBER_HOLONYMS
, SUBSTANCE_HOLONYMS
, PART_HOLONYMS
)
70 MEMBER_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.member_meronyms
71 SUBSTANCE_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.substance_meronyms
72 PART_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.part_meronyms
74 MERONYMY
= (MEMBER_MERONYMS
, SUBSTANCE_MERONYMS
, PART_MERONYMS
)
76 ATTRIBUTES
= nltk
.corpus
.reader
.wordnet
.Synset
.attributes
77 ENTAILMENTS
= nltk
.corpus
.reader
.wordnet
.Synset
.entailments
78 CAUSES
= nltk
.corpus
.reader
.wordnet
.Synset
.causes
79 ALSO_SEES
= nltk
.corpus
.reader
.wordnet
.Synset
.also_sees
80 VERB_GROUPS
= nltk
.corpus
.reader
.wordnet
.Synset
.verb_groups
81 SIMILAR_TOS
= nltk
.corpus
.reader
.wordnet
.Synset
.similar_tos
100 class SimpleConcepts(OntologyBuilderConcepts
):
102 Generate concepts for terms out of collections of wordnet synsets.
104 The basic idea is taken from [Navigli and Velardi 2002], though it's simplified somewhat.
106 Concepts are basically generated by doing a primitive sort of lexical
107 disambiguation, by computing scores for a subterm based upon the presence
108 of specific connections in wordnet of a subterm and the subterm preceding it.
110 In [Navigli and Velardi 2002], the synset for the initial subterm was
111 chosen manually, but this algorithm instead tries to match it against
112 the subterm following it. It seems to work well enough for the purposes
116 @Cached(lambda *x
, **y
: OntologyBuilderFramework
.CONCEPTS
)
117 def _get_concepts(self
, **state
):
119 given a collection of synonyms (bundles of synsets per subterm),
120 disambiguate the synsets and return a single synsets per subterm.
122 12 specific structural relations mentioned in [Navigli and Velardi 2002]
123 are used in the process of disambiguation.
125 synonyms
= state
[OntologyBuilderFramework
.SYNONYMS
]
128 for synonym
in synonyms
:
132 """make sure that the head of the concept is nominal"""
133 nominal_concept
= self
.__ensure
_nominal
(synonym
)
134 if not nominal_concept
[-1]:
135 continue # if not a nominal concept, just skip it
137 concept
= self
.get_concept(nominal_concept
)
140 concept
= tuple(concept
) # finalize it
141 print "%s means %s" % (synonyms
[synonym
], concept
)
143 if concept
in concepts
:
144 concepts
[concept
].update(synonyms
[synonym
])
146 concepts
[concept
] = set(synonyms
[synonym
])
148 print "...... unmeaningful: %s" % (synonyms
[synonym
])
152 def __ensure_nominal(self
, synonym
):
153 """filters out non-nominal synsets from the head of the term"""
154 filtered_end
= filter(NOMINAL_SYNSET
.match
, synonym
[-1])
155 return tuple(chain(synonym
[:-1], [filtered_end
]))
157 def get_concept(self
, synonym
, debug
=False):
159 disambiguate a specific synonym.
161 initial_synset
= self
.__get
_initial
_synset
(synonym
, debug
=debug
)
162 if not initial_synset
:
164 print "no initial synset found"
167 concept
= [initial_synset
]
169 for k
in xrange(1, len(synonym
)):
170 best_synset
= self
.__get
_best
_synset
(synonym
, k
, debug
=debug
)
174 print "no optimal synset found for %s" % (synonym
[k
])
177 concept
.append(best_synset
)
181 def __get_initial_synset(self
, synonym
, debug
=False):
183 compares the synsets for the initial subterm to those coming after it
184 to attempt to disambiguate the sense of the subterm.
186 subterm_synsets
= list(synonym
[0])
188 for subterm_synset
in subterm_synsets
:
189 score
= self
.__get
_initial
_score
(subterm_synset
, synonym
)
193 print subterm_synsets
196 index
= self
.__get
_best
_score
_index
(scores
)
197 if index
is not None:
198 return subterm_synsets
[index
]
200 return None # we can't do anything w/ this
202 def __get_initial_score(self
, synset
, synonym
):
204 get the score for a particular initial synset.
205 If no matching synsets are found at all, returns C{[0]}
207 for i
in xrange(1, len(synonym
)):
208 score
, score_total
= self
.__get
_initial
_specific
_score
(synset
, synonym
[i
])
212 return [0] # it's all 0s, we'll dump the term later
214 def __get_initial_specific_score(self
, synset
, other_synsets
):
216 get the initial score of a synset compared to each of a collection of other synsets
218 score
= [self
.__get
_intersection
_score
(synset
, other_synset
)
219 for other_synset
in other_synsets
]
220 return score
, sum(score
)
222 def __get_best_synset(self
, synonym
, k
, debug
=False):
224 disambiguate the synsets of a subterm by looking for particular relations in wordnet
225 between them and the the synsets of the preceding subterm.
227 returns the best synset, or None if no relationships are found.
229 subterm_synsets
= list(synonym
[k
])
231 for subterm_synset
in subterm_synsets
:
232 score
= self
.__get
_score
(subterm_synset
, synonym
, k
)
236 print subterm_synsets
239 index
= self
.__get
_best
_score
_index
(scores
)
240 if index
is not None:
241 return subterm_synsets
[index
]
243 return None # we can't do anything w/ this
245 def __get_score(self
, synset
, synonym
, k
):
247 get the score of a C{synset} compared to all of the synsets
250 if no matches are found at C{k-1}, try C{k-2} all the way to C{0}
252 for i
in xrange(k
-1, -1, -1):
253 score
, score_total
= self
.__get
_specific
_score
(synset
, synonym
, synonym
[i
])
257 return [0] # it's all 0s, we'll deal w/ that later
259 def __get_specific_score(self
, synset
, synonym
, other_synsets
):
261 get the score of a synset compared to each of a collection of other synsets
263 score
= [self
.__get
_intersection
_score
(other_synset
, synset
)
264 for other_synset
in other_synsets
]
265 return score
, sum(score
)
268 def __get_intersection_score(self
, other_synset
, synset
):
270 count the number of semantically meaningful relationships in wordnet
271 as specified in [Navigli and Velardi 2002]
273 S1
= wn
.synset(other_synset
)
274 S2
= wn
.synset(synset
)
276 score
+= self
.__get
_colour
_score
(S1
, S2
)
277 score
+= self
.__get
_domain
_score
(S1
, S2
)
278 score
+= self
.__get
_synonymy
_score
(S1
, S2
)
279 score
+= self
.__get
_hypernymy
_meronymy
_path
_score
(S1
, S2
)
280 score
+= self
.__get
_hyponymy
_holonymy
_path
_score
(S1
, S2
)
281 score
+= self
.__get
_parallelism
_score
(S1
, S2
)
282 score
+= self
.__get
_gloss
_score
(S1
, S2
)
283 score
+= self
.__get
_gloss
_hyperonymy
_meronymy
_path
_score
(S1
, S2
)
284 score
+= self
.__get
_gloss
_parallelism
_score
(S1
, S2
)
285 score
+= self
.__get
_gloss
_gloss
_score
(S1
, S2
)
286 score
+= self
.__get
_hyperonymy
_meronyomy
_gloss
_path
_score
(S1
, S2
)
287 score
+= self
.__get
_parallelism
_gloss
_score
(S1
, S2
)
288 #print "intersection score: %s %s %s" % (other_synset, synset, score)
291 CHROMATIC
= wn
.synset('chromatic.a.03')
292 PHYSICAL_ENTITY
= wn
.synset('physical_entity.n.01')
293 COLORS
= frozenset(CHROMATIC
.similar_tos())
295 def __get_colour_score(self
, S1
, S2
):
297 if S1 is a color and S2 is a physical object, return 1. else return 0.
299 if S1
in SimpleConcepts
.COLORS
and SimpleConcepts
.PHYSICAL_ENTITY
in chain(*S2
.hypernym_paths()):
304 def __get_domain_score(self
, S1
, S2
):
306 if the definition of S1 is of the form that it indicates a property
307 of something specific, see if S2 is that type of thing.
309 m
= DOMAIN_LABEL
.match(S1
.definition
)
311 domain_label_synsets
= wn
.synsets(m
.group(1).replace(' ', '_'), pos
=wn
.NOUN
)
313 for domain_label_synset
in domain_label_synsets
:
314 if S2
== domain_label_synset
or S2
in domain_label_synset
.hyponyms():
320 def __glosses(self
, synset
):
322 return a list of all definitions and examples of the synset,
323 along with all definitions and examples of anything similar to it.
325 l
= list(synset
.examples
)
326 l
.append(synset
.definition
)
327 for other
in synset
.similar_tos():
328 l
.append(other
.definition
)
329 l
.extend(other
.examples
)
332 def __get_synonymy_score(self
, S1
, S2
):
334 If S1 and S2 are synonyms, return 1. else 0.
336 if S1
== S2
or S2
in chain(*[(b
.synset
for b
in a
.pertainyms()) for a
in S1
.lemmas
]):
342 def __net(self
, synset
, max_depth
=3, *relations
):
344 finds all connected synsets of @param synset of the kind specified in @param relations,
345 up to @param max_depth, which has a default of 3.
353 connections
= frozenset(chain(*[nym(synset
) for nym
in relations
]))
359 return frozenset(chain(connections
,
360 chain(*[self
.__net
(connection
, max_depth
-1, *relations
) for connection
in connections
])))
363 def __get_hypernymy_meronymy_path_score(self
, S1
, S2
):
365 If something above S1 and something below S2 are identical,
366 return 1, otherwise return 0.
368 net1
= self
.__net
(S1
, 3, *chain(Nyms
.HYPERNYMY
, Nyms
.MERONYMY
))
369 net2
= self
.__net
(S2
, 3, *chain(Nyms
.HYPONYMY
, Nyms
.HOLONYMY
))
370 if S1
in net2
or S2
in net1
or net1
.intersection(net2
):
375 def __get_hyponymy_holonymy_path_score(self
, S1
, S2
):
377 If something above S2 and something below S1 are identical,
378 return 1, otherwise return 0.
380 return self
.__get
_hypernymy
_meronymy
_path
_score
(S2
, S1
)
382 def __get_parallelism_score(self
, S1
, S2
):
384 If S1 and S2 share something above them, return 1, else return 0
386 net1
= self
.__net
(S1
, 3, *chain(Nyms
.HYPERNYMY
))
387 net2
= self
.__net
(S2
, 3, *chain(Nyms
.HYPERNYMY
))
388 if S1
in net2
or S2
in net1
or net1
.intersection(net2
):
393 def __is_a_subsequence(self
, a
, b
):
395 returns true if @param a is a subsequence of @param b
397 for j
in xrange(len(b
)-len(a
)+1):
399 for i
in xrange(len(a
)):
400 if not a
[i
] == b
[i
+j
]:
407 def __get_gloss_score(self
, S1
, S2
):
409 if a word representing S1 is mentioned in the glosses of S2 or vice versa,
410 return 1, otherwise return 0
412 for lemma
in S2
.lemmas
:
413 for example
in self
.__glosses
(S1
):
414 if self
.__is
_a
_subsequence
(lemma
.name
.split('_'), example
.split(' ')):
416 for lemma
in S1
.lemmas
:
417 for example
in self
.__glosses
(S2
):
418 if self
.__is
_a
_subsequence
(lemma
.name
.split('_'), example
.split(' ')):
422 def __get_gloss_hyperonymy_meronymy_path_score(self
, S1
, S2
):
424 if something in the gloss of S1 is above or below S2, return 1
427 for example
in self
.__glosses
(S1
):
428 for word
in example
.split(' '):
429 for synset
in wn
.synsets(word
):
430 if (self
.__get
_hypernymy
_meronymy
_path
_score
(synset
, S2
)
431 or self
.__get
_hyponymy
_holonymy
_path
_score
(synset
, S2
)):
435 def __get_gloss_parallelism_score(self
, S1
, S2
):
437 if there is a common term above S2 and a word in the glosses of S1,
438 return 1, otherwise 0
440 for example
in self
.__glosses
(S1
):
441 for word
in example
.split(' '):
442 for synset
in wn
.synsets(word
):
443 if self
.__get
_parallelism
_score
(synset
, S2
):
447 def __get_gloss_gloss_score(self
, S1
, S2
):
449 If a word in the glosses of S1 shares a synset with a word in the glosses of S2,
450 return 1, otherwise 0.
452 synsets_lists1
= list(chain(*((wn
.synsets(word
) for word
in example
.split(' ')) for example
in self
.__glosses
(S1
))))
453 synsets_lists2
= list(chain(*((wn
.synsets(word
) for word
in example
.split(' ')) for example
in self
.__glosses
(S2
))))
454 for synsets
in synsets_lists1
:
455 if synsets
in synsets_lists2
:
460 def __get_hyperonymy_meronyomy_gloss_path_score(self
, S1
, S2
):
462 if something in the gloss of S2 is above or below S1, return 1
465 return self
.__get
_gloss
_hyperonymy
_meronymy
_path
_score
(S2
, S1
)
467 def __get_parallelism_gloss_score(self
, S1
, S2
):
469 if there is a common term above S1 and a word in the glosses of S2,
470 return 1, otherwise 0
472 return self
.__get
_gloss
_parallelism
_score
(S2
, S1
)
474 def __get_best_score_index(self
, scores
):
476 given a set of scores, return the index of the "best" one
478 scores are assumed to be lists of numbers
480 best_score_index
= None
481 for i
in xrange(len(scores
)):
484 if best_score_index
is None:
485 if len(filter(lambda x
: x
!= 0, score
)) != 0:
487 elif self
.__is
_better
_score
(score
, scores
[best_score_index
]):
489 return best_score_index
491 def __is_better_score(self
, score_a
, score_b
):
493 returns true if score_a is better than score_b
495 return self
.__value
(score_a
) > self
.__value
(score_b
)
497 def __value(self
, score
):
499 [Navigli and Velardi 2002] mention some sort of "lexicographic ordering" of scores
500 but don't specify exactly what they mean by this, and my attempts at recreating it
501 produced incredibly weird results. Thus, I've opted for something a little simpler
502 that seems to work well enough in practice.
504 the "value" of a score is considered to be 3* the number of non-0 subscores in it,
505 plus the sum total of the subscores.
506 thus, having two scores of "1" results in a total score of 8, which is equivalent
507 to a single score of "5"
509 # dictionary order gives weird results.
510 return sum(score
) + 3*len(filter(lambda x
: x
!= 0, score
))