dependencies
[nltk_ontology_framework.git] / src / mjacob / ontologybuilder / simple_concepts.py
blob803be139eb5a45b92b9f4bc4fa4c94a4600f8ed4
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 5, 2011
5 @author: mjacob
6 '''
8 from itertools import chain
9 from framework import OntologyBuilderConcepts, OntologyBuilderFramework
10 import nltk
11 from util.cached import Cached
12 from util.memoized import Memoized
13 wn = nltk.corpus.wordnet # this is to subvert a bug in my IDE
14 import re
# Matches WordNet adjective glosses of the form "(of X) ...", which indicate
# that the sense applies to a specific domain (e.g. "(of birds) ...").
# Raw strings keep the backslash escapes literal instead of relying on
# Python passing unknown string escapes through unchanged.
DOMAIN_LABEL = re.compile(r'\(of ([^\)]*)\)')
# Matches a nominal synset name of the form "word.n.NN" (e.g. "dog.n.01").
NOMINAL_SYNSET = re.compile(r'\w+\.n\.\d+$')
class Nyms(object):
    """
    Shortcuts for referring to particular relationships in wordnet.

    Each attribute is an unbound accessor on
    C{nltk.corpus.reader.wordnet.Synset}; the plural-named tuples group
    related accessors:

    HYPERNYMY - hypernyms + instance hypernyms ("above" relations)
    HYPONYMY  - hyponyms + instance hyponyms ("below" relations)
    HOLONYMY  - member/substance/part holonyms (is-part-of relations)
    MERONYMY  - member/substance/part meronyms (has-part relations)
    ALL       - every accessor listed on this class
    """
    HYPERNYMS = nltk.corpus.reader.wordnet.Synset.hypernyms
    INSTANCE_HYPERNYMS = nltk.corpus.reader.wordnet.Synset.instance_hypernyms
    HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)

    HYPONYMS = nltk.corpus.reader.wordnet.Synset.hyponyms
    INSTANCE_HYPONYMS = nltk.corpus.reader.wordnet.Synset.instance_hyponyms
    HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)

    MEMBER_HOLONYMS = nltk.corpus.reader.wordnet.Synset.member_holonyms
    SUBSTANCE_HOLONYMS = nltk.corpus.reader.wordnet.Synset.substance_holonyms
    PART_HOLONYMS = nltk.corpus.reader.wordnet.Synset.part_holonyms
    HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)

    MEMBER_MERONYMS = nltk.corpus.reader.wordnet.Synset.member_meronyms
    SUBSTANCE_MERONYMS = nltk.corpus.reader.wordnet.Synset.substance_meronyms
    PART_MERONYMS = nltk.corpus.reader.wordnet.Synset.part_meronyms
    MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)

    ATTRIBUTES = nltk.corpus.reader.wordnet.Synset.attributes
    ENTAILMENTS = nltk.corpus.reader.wordnet.Synset.entailments
    CAUSES = nltk.corpus.reader.wordnet.Synset.causes
    ALSO_SEES = nltk.corpus.reader.wordnet.Synset.also_sees
    VERB_GROUPS = nltk.corpus.reader.wordnet.Synset.verb_groups
    SIMILAR_TOS = nltk.corpus.reader.wordnet.Synset.similar_tos

    # Concatenating the groupings yields the same members in the same order
    # as listing every accessor individually.
    ALL = (HYPERNYMY + HYPONYMY + HOLONYMY + MERONYMY
           + (ATTRIBUTES, ENTAILMENTS, CAUSES,
              ALSO_SEES, VERB_GROUPS, SIMILAR_TOS))
100 class SimpleConcepts(OntologyBuilderConcepts):
102 Generate concepts for terms out of collections of wordnet synsets.
104 The basic idea is taken from [Navigli and Velardi 2002], though it's simplified somewhat.
106 Concepts are basically generated by doing a primitive sort of lexical
107 disambiguation, by computing scores for a subterm based upon the presence
108 of specific connections in wordnet of a subterm and the subterm preceding it.
110 In [Navigli and Velardi 2002], the synset for the initial subterm was
111 chosen manually, but this algorithm instead tries to match it against
112 the subterm following it. It seems to work well enough for the purposes
113 of demonstration.
116 def __init__(self, debug=False):
117 self.__debug = debug
119 @Cached(lambda *x, **y: OntologyBuilderFramework.CONCEPTS)
120 def _get_concepts(self, **state):
122 given a collection of synonyms (bundles of synsets per subterm),
123 disambiguate the synsets and return a single synsets per subterm.
125 12 specific structural relations mentioned in [Navigli and Velardi 2002]
126 are used in the process of disambiguation.
128 synonyms = state[OntologyBuilderFramework.SYNONYMS]
130 concepts = {}
131 for synonym in synonyms:
132 if len(synonym) < 2:
133 continue
135 """make sure that the head of the concept is nominal"""
136 nominal_concept = self.__ensure_nominal(synonym)
137 if not nominal_concept[-1]:
138 continue # if not a nominal concept, just skip it
140 concept = self.get_concept(nominal_concept)
142 if concept:
143 concept = tuple(concept) # finalize it
144 if self.__debug:
145 print "%s means %s" % (synonyms[synonym], concept)
147 if concept in concepts:
148 concepts[concept].update(synonyms[synonym])
149 else:
150 concepts[concept] = set(synonyms[synonym])
151 elif self.__debug:
152 print "...... unmeaningful: %s" % (synonyms[synonym])
154 return concepts
156 def __ensure_nominal(self, synonym):
157 """filters out non-nominal synsets from the head of the term"""
158 filtered_end = filter(NOMINAL_SYNSET.match, synonym[-1])
159 return tuple(chain(synonym[:-1], [filtered_end]))
161 def get_concept(self, synonym):
163 disambiguate a specific synonym.
165 initial_synset = self.__get_initial_synset(synonym)
166 if not initial_synset:
167 if self.__debug:
168 print "no initial synset found"
169 return None
171 concept = [initial_synset]
173 for k in xrange(1, len(synonym)):
174 best_synset = self.__get_best_synset(synonym, k)
176 if not best_synset:
177 if self.__debug:
178 print "no optimal synset found for %s" % (synonym[k])
179 return None
181 concept.append(best_synset)
183 return concept
185 def __get_initial_synset(self, synonym):
187 compares the synsets for the initial subterm to those coming after it
188 to attempt to disambiguate the sense of the subterm.
190 subterm_synsets = list(synonym[0])
191 scores = []
192 for subterm_synset in subterm_synsets:
193 score = self.__get_initial_score(subterm_synset, synonym)
194 scores.append(score)
196 if self.__debug:
197 print subterm_synsets
198 print scores
200 index = self.__get_best_score_index(scores)
201 if index is not None:
202 return subterm_synsets[index]
203 else:
204 return None # we can't do anything w/ this
206 def __get_initial_score(self, synset, synonym):
208 get the score for a particular initial synset.
209 If no matching synsets are found at all, returns C{[0]}
211 for i in xrange(1, len(synonym)):
212 score, score_total = self.__get_initial_specific_score(synset, synonym[i])
213 if score_total != 0:
214 return score
216 return [0] # it's all 0s, we'll dump the term later
218 def __get_initial_specific_score(self, synset, other_synsets):
220 get the initial score of a synset compared to each of a collection of other synsets
222 score = [self.__get_intersection_score(synset, other_synset)
223 for other_synset in other_synsets]
224 return score, sum(score)
226 def __get_best_synset(self, synonym, k):
228 disambiguate the synsets of a subterm by looking for particular relations in wordnet
229 between them and the the synsets of the preceding subterm.
231 returns the best synset, or None if no relationships are found.
233 subterm_synsets = list(synonym[k])
234 scores = []
235 for subterm_synset in subterm_synsets:
236 score = self.__get_score(subterm_synset, synonym, k)
237 scores.append(score)
239 if self.__debug:
240 print subterm_synsets
241 print scores
243 index = self.__get_best_score_index(scores)
244 if index is not None:
245 return subterm_synsets[index]
246 else:
247 return None # we can't do anything w/ this
249 def __get_score(self, synset, synonym, k):
251 get the score of a C{synset} compared to all of the synsets
252 at subterm C{k-1}
254 if no matches are found at C{k-1}, try C{k-2} all the way to C{0}
256 for i in xrange(k-1, -1, -1):
257 score, score_total = self.__get_specific_score(synset, synonym, synonym[i])
258 if score_total != 0:
259 return score
261 return [0] # it's all 0s, we'll deal w/ that later
263 def __get_specific_score(self, synset, synonym, other_synsets):
265 get the score of a synset compared to each of a collection of other synsets
267 score = [self.__get_intersection_score(other_synset, synset)
268 for other_synset in other_synsets]
269 return score, sum(score)
271 @Memoized
272 def __get_intersection_score(self, other_synset, synset):
274 count the number of semantically meaningful relationships in wordnet
275 as specified in [Navigli and Velardi 2002]
277 S1 = wn.synset(other_synset)
278 S2 = wn.synset(synset)
279 score = 0
280 score += self.__get_colour_score(S1, S2)
281 score += self.__get_domain_score(S1, S2)
282 score += self.__get_synonymy_score(S1, S2)
283 score += self.__get_hypernymy_meronymy_path_score(S1, S2)
284 score += self.__get_hyponymy_holonymy_path_score(S1, S2)
285 score += self.__get_parallelism_score(S1, S2)
286 score += self.__get_gloss_score(S1, S2)
287 score += self.__get_gloss_hyperonymy_meronymy_path_score(S1, S2)
288 score += self.__get_gloss_parallelism_score(S1, S2)
289 score += self.__get_gloss_gloss_score(S1, S2)
290 score += self.__get_hyperonymy_meronyomy_gloss_path_score(S1, S2)
291 score += self.__get_parallelism_gloss_score(S1, S2)
292 #print "intersection score: %s %s %s" % (other_synset, synset, score)
293 return score
295 CHROMATIC = wn.synset('chromatic.a.03')
296 PHYSICAL_ENTITY = wn.synset('physical_entity.n.01')
297 COLORS = frozenset(CHROMATIC.similar_tos())
299 def __get_colour_score(self, S1, S2):
301 if S1 is a color and S2 is a physical object, return 1. else return 0.
303 if S1 in SimpleConcepts.COLORS and SimpleConcepts.PHYSICAL_ENTITY in chain(*S2.hypernym_paths()):
304 return 1
305 else:
306 return 0
308 def __get_domain_score(self, S1, S2):
310 if the definition of S1 is of the form that it indicates a property
311 of something specific, see if S2 is that type of thing.
313 m = DOMAIN_LABEL.match(S1.definition)
314 if m:
315 domain_label_synsets = wn.synsets(m.group(1).replace(' ', '_'), pos=wn.NOUN)
316 score = 0
317 for domain_label_synset in domain_label_synsets:
318 if S2 == domain_label_synset or S2 in domain_label_synset.hyponyms():
319 score += 1
320 return score
321 return 0
323 @Memoized
324 def __glosses(self, synset):
326 return a list of all definitions and examples of the synset,
327 along with all definitions and examples of anything similar to it.
329 l = list(synset.examples)
330 l.append(synset.definition)
331 for other in synset.similar_tos():
332 l.append(other.definition)
333 l.extend(other.examples)
334 return l
336 def __get_synonymy_score(self, S1, S2):
338 If S1 and S2 are synonyms, return 1. else 0.
340 if S1 == S2 or S2 in chain(*[(b.synset for b in a.pertainyms()) for a in S1.lemmas]):
341 return 1
342 else:
343 return 0
345 @Memoized
346 def __net(self, synset, max_depth=3, *relations):
348 finds all connected synsets of @param synset of the kind specified in @param relations,
349 up to @param max_depth, which has a default of 3.
351 if max_depth < 1:
352 return []
354 if not relations:
355 relations = Nyms.ALL
357 connections = frozenset(chain(*[nym(synset) for nym in relations]))
359 if max_depth == 1:
360 return connections
362 else:
363 return frozenset(chain(connections,
364 chain(*[self.__net(connection, max_depth-1, *relations) for connection in connections])))
367 def __get_hypernymy_meronymy_path_score(self, S1, S2):
369 If something above S1 and something below S2 are identical,
370 return 1, otherwise return 0.
372 net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY, Nyms.MERONYMY))
373 net2 = self.__net(S2, 3, *chain(Nyms.HYPONYMY, Nyms.HOLONYMY))
374 if S1 in net2 or S2 in net1 or net1.intersection(net2):
375 return 1
376 else:
377 return 0
379 def __get_hyponymy_holonymy_path_score(self, S1, S2):
381 If something above S2 and something below S1 are identical,
382 return 1, otherwise return 0.
384 return self.__get_hypernymy_meronymy_path_score(S2, S1)
386 def __get_parallelism_score(self, S1, S2):
388 If S1 and S2 share something above them, return 1, else return 0
390 net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY))
391 net2 = self.__net(S2, 3, *chain(Nyms.HYPERNYMY))
392 if S1 in net2 or S2 in net1 or net1.intersection(net2):
393 return 1
394 else:
395 return 0
397 def __is_a_subsequence(self, a, b):
399 returns true if @param a is a subsequence of @param b
401 for j in xrange(len(b)-len(a)+1):
402 match = True
403 for i in xrange(len(a)):
404 if not a[i] == b[i+j]:
405 match=False
406 break
407 if match:
408 return True
409 return False
411 def __get_gloss_score(self, S1, S2):
413 if a word representing S1 is mentioned in the glosses of S2 or vice versa,
414 return 1, otherwise return 0
416 for lemma in S2.lemmas:
417 for example in self.__glosses(S1):
418 if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')):
419 return 1
420 for lemma in S1.lemmas:
421 for example in self.__glosses(S2):
422 if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')):
423 return 1
424 return 0
426 def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2):
428 if something in the gloss of S1 is above or below S2, return 1
429 otherwise 0
431 for example in self.__glosses(S1):
432 for word in example.split(' '):
433 for synset in wn.synsets(word):
434 if (self.__get_hypernymy_meronymy_path_score(synset, S2)
435 or self.__get_hyponymy_holonymy_path_score(synset, S2)):
436 return 1
437 return 0
439 def __get_gloss_parallelism_score(self, S1, S2):
441 if there is a common term above S2 and a word in the glosses of S1,
442 return 1, otherwise 0
444 for example in self.__glosses(S1):
445 for word in example.split(' '):
446 for synset in wn.synsets(word):
447 if self.__get_parallelism_score(synset, S2):
448 return 1
449 return 0
451 def __get_gloss_gloss_score(self, S1, S2):
453 If a word in the glosses of S1 shares a synset with a word in the glosses of S2,
454 return 1, otherwise 0.
456 synsets_lists1 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S1))))
457 synsets_lists2 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S2))))
458 for synsets in synsets_lists1:
459 if synsets in synsets_lists2:
460 return 1
462 return 0
464 def __get_hyperonymy_meronyomy_gloss_path_score(self, S1, S2):
466 if something in the gloss of S2 is above or below S1, return 1
467 otherwise 0
469 return self.__get_gloss_hyperonymy_meronymy_path_score(S2, S1)
471 def __get_parallelism_gloss_score(self, S1, S2):
473 if there is a common term above S1 and a word in the glosses of S2,
474 return 1, otherwise 0
476 return self.__get_gloss_parallelism_score(S2, S1)
478 def __get_best_score_index(self, scores):
480 given a set of scores, return the index of the "best" one
482 scores are assumed to be lists of numbers
484 best_score_index = None
485 for i in xrange(len(scores)):
486 score = scores[i]
487 if score:
488 if best_score_index is None:
489 if len(filter(lambda x: x != 0, score)) != 0:
490 best_score_index = i
491 elif self.__is_better_score(score, scores[best_score_index]):
492 best_score_index = i
493 return best_score_index
495 def __is_better_score(self, score_a, score_b):
497 returns true if score_a is better than score_b
499 return self.__value(score_a) > self.__value(score_b)
501 def __value(self, score):
503 [Navigli and Velardi 2002] mention some sort of "lexicographic ordering" of scores
504 but don't specify exactly what they mean by this, and my attempts at recreating it
505 produced incredibly weird results. Thus, I've opted for something a little simpler
506 that seems to work well enough in practice.
508 the "value" of a score is considered to be 3* the number of non-0 subscores in it,
509 plus the sum total of the subscores.
510 thus, having two scores of "1" results in a total score of 8, which is equivalent
511 to a single score of "5"
513 # dictionary order gives weird results.
514 return sum(score) + 3*len(filter(lambda x: x != 0, score))