1 # This Python file uses the following encoding: utf-8
8 from itertools
import chain
9 from framework
import OntologyBuilderConcepts
, OntologyBuilderFramework
11 from util
.cached
import Cached
12 from util
.memoized
import Memoized
# module-level handle on the wordnet corpus reader
wn = nltk.corpus.wordnet # this is to subvert a bug in my IDE

# Matches the "(of X)" domain prefix that wordnet uses in adjective
# definitions, e.g. "(of colors) having ...": group 1 captures the domain
# phrase.  Raw strings keep the regex escapes (\(, \w, \d) from being
# interpreted as (invalid) Python string escapes; the compiled patterns
# are byte-identical to the non-raw originals.
DOMAIN_LABEL = re.compile(r'\(of ([^\)]*)\)')
# Matches a nominal synset name such as "dog.n.01" (noun part-of-speech only).
NOMINAL_SYNSET = re.compile(r'\w+\.n\.\d+$')
21 Shortcuts for referring to particular relationships in wordnet
23 HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.hypernyms}
24 INSTANCE_HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hypernyms}
26 HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)
28 HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.hyponyms}
29 INSTANCE_HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hyponyms}
31 HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)
33 MEMBER_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.member_holonyms}
34 SUBSTANCE_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_holonyms}
35 PART_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.part_holonyms}
37 HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)
39 MEMBER_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.member_meronyms}
40 SUBSTANCE_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_meronyms}
41 PART_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.part_meronyms}
43 MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)
45 ATTRIBUTES = C{nltk.corpus.reader.wordnet.Synset.attributes}
46 ENTAILMENTS = C{nltk.corpus.reader.wordnet.Synset.entailments}
47 CAUSES = C{nltk.corpus.reader.wordnet.Synset.causes}
48 ALSO_SEES = C{nltk.corpus.reader.wordnet.Synset.also_sees}
49 VERB_GROUPS = C{nltk.corpus.reader.wordnet.Synset.verb_groups}
50 SIMILAR_TOS = C{nltk.corpus.reader.wordnet.Synset.similar_tos}
54 HYPERNYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.hypernyms
55 INSTANCE_HYPERNYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.instance_hypernyms
57 HYPERNYMY
= (HYPERNYMS
, INSTANCE_HYPERNYMS
)
59 HYPONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.hyponyms
60 INSTANCE_HYPONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.instance_hyponyms
62 HYPONYMY
= (HYPONYMS
, INSTANCE_HYPONYMS
)
64 MEMBER_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.member_holonyms
65 SUBSTANCE_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.substance_holonyms
66 PART_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.part_holonyms
68 HOLONYMY
= (MEMBER_HOLONYMS
, SUBSTANCE_HOLONYMS
, PART_HOLONYMS
)
70 MEMBER_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.member_meronyms
71 SUBSTANCE_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.substance_meronyms
72 PART_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.part_meronyms
74 MERONYMY
= (MEMBER_MERONYMS
, SUBSTANCE_MERONYMS
, PART_MERONYMS
)
76 ATTRIBUTES
= nltk
.corpus
.reader
.wordnet
.Synset
.attributes
77 ENTAILMENTS
= nltk
.corpus
.reader
.wordnet
.Synset
.entailments
78 CAUSES
= nltk
.corpus
.reader
.wordnet
.Synset
.causes
79 ALSO_SEES
= nltk
.corpus
.reader
.wordnet
.Synset
.also_sees
80 VERB_GROUPS
= nltk
.corpus
.reader
.wordnet
.Synset
.verb_groups
81 SIMILAR_TOS
= nltk
.corpus
.reader
.wordnet
.Synset
.similar_tos
class SimpleConcepts(OntologyBuilderConcepts):
    """
    Generate concepts for terms out of collections of wordnet synsets.

    The basic idea is taken from [Navigli and Velardi 2002], though it's simplified somewhat.

    Concepts are basically generated by doing a primitive sort of lexical
    disambiguation, by computing scores for a subterm based upon the presence
    of specific connections in wordnet of a subterm and the subterm preceding it.

    In [Navigli and Velardi 2002], the synset for the initial subterm was
    chosen manually, but this algorithm instead tries to match it against
    the subterm following it. It seems to work well enough for the purposes
    of this framework.
    """
116 def __init__(self
, debug
=False):
119 @Cached(lambda *x
, **y
: OntologyBuilderFramework
.CONCEPTS
)
120 def _get_concepts(self
, **state
):
122 given a collection of synonyms (bundles of synsets per subterm),
123 disambiguate the synsets and return a single synsets per subterm.
125 12 specific structural relations mentioned in [Navigli and Velardi 2002]
126 are used in the process of disambiguation.
128 synonyms
= state
[OntologyBuilderFramework
.SYNONYMS
]
131 for synonym
in synonyms
:
135 """make sure that the head of the concept is nominal"""
136 nominal_concept
= self
.__ensure
_nominal
(synonym
)
137 if not nominal_concept
[-1]:
138 continue # if not a nominal concept, just skip it
140 concept
= self
.get_concept(nominal_concept
)
143 concept
= tuple(concept
) # finalize it
145 print "%s means %s" % (synonyms
[synonym
], concept
)
147 if concept
in concepts
:
148 concepts
[concept
].update(synonyms
[synonym
])
150 concepts
[concept
] = set(synonyms
[synonym
])
152 print "...... unmeaningful: %s" % (synonyms
[synonym
])
156 def __ensure_nominal(self
, synonym
):
157 """filters out non-nominal synsets from the head of the term"""
158 filtered_end
= filter(NOMINAL_SYNSET
.match
, synonym
[-1])
159 return tuple(chain(synonym
[:-1], [filtered_end
]))
161 def get_concept(self
, synonym
):
163 disambiguate a specific synonym.
165 initial_synset
= self
.__get
_initial
_synset
(synonym
)
166 if not initial_synset
:
168 print "no initial synset found"
171 concept
= [initial_synset
]
173 for k
in xrange(1, len(synonym
)):
174 best_synset
= self
.__get
_best
_synset
(synonym
, k
)
178 print "no optimal synset found for %s" % (synonym
[k
])
181 concept
.append(best_synset
)
185 def __get_initial_synset(self
, synonym
):
187 compares the synsets for the initial subterm to those coming after it
188 to attempt to disambiguate the sense of the subterm.
190 subterm_synsets
= list(synonym
[0])
192 for subterm_synset
in subterm_synsets
:
193 score
= self
.__get
_initial
_score
(subterm_synset
, synonym
)
197 print subterm_synsets
200 index
= self
.__get
_best
_score
_index
(scores
)
201 if index
is not None:
202 return subterm_synsets
[index
]
204 return None # we can't do anything w/ this
206 def __get_initial_score(self
, synset
, synonym
):
208 get the score for a particular initial synset.
209 If no matching synsets are found at all, returns C{[0]}
211 for i
in xrange(1, len(synonym
)):
212 score
, score_total
= self
.__get
_initial
_specific
_score
(synset
, synonym
[i
])
216 return [0] # it's all 0s, we'll dump the term later
218 def __get_initial_specific_score(self
, synset
, other_synsets
):
220 get the initial score of a synset compared to each of a collection of other synsets
222 score
= [self
.__get
_intersection
_score
(synset
, other_synset
)
223 for other_synset
in other_synsets
]
224 return score
, sum(score
)
226 def __get_best_synset(self
, synonym
, k
):
228 disambiguate the synsets of a subterm by looking for particular relations in wordnet
229 between them and the the synsets of the preceding subterm.
231 returns the best synset, or None if no relationships are found.
233 subterm_synsets
= list(synonym
[k
])
235 for subterm_synset
in subterm_synsets
:
236 score
= self
.__get
_score
(subterm_synset
, synonym
, k
)
240 print subterm_synsets
243 index
= self
.__get
_best
_score
_index
(scores
)
244 if index
is not None:
245 return subterm_synsets
[index
]
247 return None # we can't do anything w/ this
249 def __get_score(self
, synset
, synonym
, k
):
251 get the score of a C{synset} compared to all of the synsets
254 if no matches are found at C{k-1}, try C{k-2} all the way to C{0}
256 for i
in xrange(k
-1, -1, -1):
257 score
, score_total
= self
.__get
_specific
_score
(synset
, synonym
, synonym
[i
])
261 return [0] # it's all 0s, we'll deal w/ that later
263 def __get_specific_score(self
, synset
, synonym
, other_synsets
):
265 get the score of a synset compared to each of a collection of other synsets
267 score
= [self
.__get
_intersection
_score
(other_synset
, synset
)
268 for other_synset
in other_synsets
]
269 return score
, sum(score
)
272 def __get_intersection_score(self
, other_synset
, synset
):
274 count the number of semantically meaningful relationships in wordnet
275 as specified in [Navigli and Velardi 2002]
277 S1
= wn
.synset(other_synset
)
278 S2
= wn
.synset(synset
)
280 score
+= self
.__get
_colour
_score
(S1
, S2
)
281 score
+= self
.__get
_domain
_score
(S1
, S2
)
282 score
+= self
.__get
_synonymy
_score
(S1
, S2
)
283 score
+= self
.__get
_hypernymy
_meronymy
_path
_score
(S1
, S2
)
284 score
+= self
.__get
_hyponymy
_holonymy
_path
_score
(S1
, S2
)
285 score
+= self
.__get
_parallelism
_score
(S1
, S2
)
286 score
+= self
.__get
_gloss
_score
(S1
, S2
)
287 score
+= self
.__get
_gloss
_hyperonymy
_meronymy
_path
_score
(S1
, S2
)
288 score
+= self
.__get
_gloss
_parallelism
_score
(S1
, S2
)
289 score
+= self
.__get
_gloss
_gloss
_score
(S1
, S2
)
290 score
+= self
.__get
_hyperonymy
_meronyomy
_gloss
_path
_score
(S1
, S2
)
291 score
+= self
.__get
_parallelism
_gloss
_score
(S1
, S2
)
292 #print "intersection score: %s %s %s" % (other_synset, synset, score)
295 CHROMATIC
= wn
.synset('chromatic.a.03')
296 PHYSICAL_ENTITY
= wn
.synset('physical_entity.n.01')
297 COLORS
= frozenset(CHROMATIC
.similar_tos())
299 def __get_colour_score(self
, S1
, S2
):
301 if S1 is a color and S2 is a physical object, return 1. else return 0.
303 if S1
in SimpleConcepts
.COLORS
and SimpleConcepts
.PHYSICAL_ENTITY
in chain(*S2
.hypernym_paths()):
308 def __get_domain_score(self
, S1
, S2
):
310 if the definition of S1 is of the form that it indicates a property
311 of something specific, see if S2 is that type of thing.
313 m
= DOMAIN_LABEL
.match(S1
.definition
)
315 domain_label_synsets
= wn
.synsets(m
.group(1).replace(' ', '_'), pos
=wn
.NOUN
)
317 for domain_label_synset
in domain_label_synsets
:
318 if S2
== domain_label_synset
or S2
in domain_label_synset
.hyponyms():
324 def __glosses(self
, synset
):
326 return a list of all definitions and examples of the synset,
327 along with all definitions and examples of anything similar to it.
329 l
= list(synset
.examples
)
330 l
.append(synset
.definition
)
331 for other
in synset
.similar_tos():
332 l
.append(other
.definition
)
333 l
.extend(other
.examples
)
336 def __get_synonymy_score(self
, S1
, S2
):
338 If S1 and S2 are synonyms, return 1. else 0.
340 if S1
== S2
or S2
in chain(*[(b
.synset
for b
in a
.pertainyms()) for a
in S1
.lemmas
]):
346 def __net(self
, synset
, max_depth
=3, *relations
):
348 finds all connected synsets of @param synset of the kind specified in @param relations,
349 up to @param max_depth, which has a default of 3.
357 connections
= frozenset(chain(*[nym(synset
) for nym
in relations
]))
363 return frozenset(chain(connections
,
364 chain(*[self
.__net
(connection
, max_depth
-1, *relations
) for connection
in connections
])))
367 def __get_hypernymy_meronymy_path_score(self
, S1
, S2
):
369 If something above S1 and something below S2 are identical,
370 return 1, otherwise return 0.
372 net1
= self
.__net
(S1
, 3, *chain(Nyms
.HYPERNYMY
, Nyms
.MERONYMY
))
373 net2
= self
.__net
(S2
, 3, *chain(Nyms
.HYPONYMY
, Nyms
.HOLONYMY
))
374 if S1
in net2
or S2
in net1
or net1
.intersection(net2
):
379 def __get_hyponymy_holonymy_path_score(self
, S1
, S2
):
381 If something above S2 and something below S1 are identical,
382 return 1, otherwise return 0.
384 return self
.__get
_hypernymy
_meronymy
_path
_score
(S2
, S1
)
386 def __get_parallelism_score(self
, S1
, S2
):
388 If S1 and S2 share something above them, return 1, else return 0
390 net1
= self
.__net
(S1
, 3, *chain(Nyms
.HYPERNYMY
))
391 net2
= self
.__net
(S2
, 3, *chain(Nyms
.HYPERNYMY
))
392 if S1
in net2
or S2
in net1
or net1
.intersection(net2
):
397 def __is_a_subsequence(self
, a
, b
):
399 returns true if @param a is a subsequence of @param b
401 for j
in xrange(len(b
)-len(a
)+1):
403 for i
in xrange(len(a
)):
404 if not a
[i
] == b
[i
+j
]:
411 def __get_gloss_score(self
, S1
, S2
):
413 if a word representing S1 is mentioned in the glosses of S2 or vice versa,
414 return 1, otherwise return 0
416 for lemma
in S2
.lemmas
:
417 for example
in self
.__glosses
(S1
):
418 if self
.__is
_a
_subsequence
(lemma
.name
.split('_'), example
.split(' ')):
420 for lemma
in S1
.lemmas
:
421 for example
in self
.__glosses
(S2
):
422 if self
.__is
_a
_subsequence
(lemma
.name
.split('_'), example
.split(' ')):
426 def __get_gloss_hyperonymy_meronymy_path_score(self
, S1
, S2
):
428 if something in the gloss of S1 is above or below S2, return 1
431 for example
in self
.__glosses
(S1
):
432 for word
in example
.split(' '):
433 for synset
in wn
.synsets(word
):
434 if (self
.__get
_hypernymy
_meronymy
_path
_score
(synset
, S2
)
435 or self
.__get
_hyponymy
_holonymy
_path
_score
(synset
, S2
)):
439 def __get_gloss_parallelism_score(self
, S1
, S2
):
441 if there is a common term above S2 and a word in the glosses of S1,
442 return 1, otherwise 0
444 for example
in self
.__glosses
(S1
):
445 for word
in example
.split(' '):
446 for synset
in wn
.synsets(word
):
447 if self
.__get
_parallelism
_score
(synset
, S2
):
451 def __get_gloss_gloss_score(self
, S1
, S2
):
453 If a word in the glosses of S1 shares a synset with a word in the glosses of S2,
454 return 1, otherwise 0.
456 synsets_lists1
= list(chain(*((wn
.synsets(word
) for word
in example
.split(' ')) for example
in self
.__glosses
(S1
))))
457 synsets_lists2
= list(chain(*((wn
.synsets(word
) for word
in example
.split(' ')) for example
in self
.__glosses
(S2
))))
458 for synsets
in synsets_lists1
:
459 if synsets
in synsets_lists2
:
464 def __get_hyperonymy_meronyomy_gloss_path_score(self
, S1
, S2
):
466 if something in the gloss of S2 is above or below S1, return 1
469 return self
.__get
_gloss
_hyperonymy
_meronymy
_path
_score
(S2
, S1
)
471 def __get_parallelism_gloss_score(self
, S1
, S2
):
473 if there is a common term above S1 and a word in the glosses of S2,
474 return 1, otherwise 0
476 return self
.__get
_gloss
_parallelism
_score
(S2
, S1
)
478 def __get_best_score_index(self
, scores
):
480 given a set of scores, return the index of the "best" one
482 scores are assumed to be lists of numbers
484 best_score_index
= None
485 for i
in xrange(len(scores
)):
488 if best_score_index
is None:
489 if len(filter(lambda x
: x
!= 0, score
)) != 0:
491 elif self
.__is
_better
_score
(score
, scores
[best_score_index
]):
493 return best_score_index
495 def __is_better_score(self
, score_a
, score_b
):
497 returns true if score_a is better than score_b
499 return self
.__value
(score_a
) > self
.__value
(score_b
)
501 def __value(self
, score
):
503 [Navigli and Velardi 2002] mention some sort of "lexicographic ordering" of scores
504 but don't specify exactly what they mean by this, and my attempts at recreating it
505 produced incredibly weird results. Thus, I've opted for something a little simpler
506 that seems to work well enough in practice.
508 the "value" of a score is considered to be 3* the number of non-0 subscores in it,
509 plus the sum total of the subscores.
510 thus, having two scores of "1" results in a total score of 8, which is equivalent
511 to a single score of "5"
513 # dictionary order gives weird results.
514 return sum(score
) + 3*len(filter(lambda x
: x
!= 0, score
))